RSSEbra/gen_feed.py

102 lines
3.6 KiB
Python
Raw Normal View History

import datetime
import re
import xml.dom.minidom
from urllib.parse import urljoin, urlparse
from zoneinfo import ZoneInfo

import requests
from bs4 import BeautifulSoup
from rfeed import *
# Matches the "/YYYY/MM/DD/" path segment of an article link; the three groups
# are the year, month and day.
regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"
# Matches the "à H:MM" fragment searched for in the text of the article page's
# "publish" span; the two groups are the hour and the minute.
regex_time = r"à (\d+):(\d{2})"


class Article:
    """An article listed on a town page, with its publication date in UTC.

    Attributes:
        title: Headline text as given by the caller.
        link: URL of the article; must contain a "/YYYY/MM/DD/" segment.
        is_paid: Whether the article is flagged as paid content.
        date: Timezone-aware publication datetime converted to UTC. The day
            comes from ``link``; the time of day is read from the article
            page and interpreted as Europe/Paris local time.
    """

    # Time of day used when the article page yields no usable publish time.
    _DEFAULT_TIME = (3, 0)
    # Seconds to wait for the article page so a stalled server cannot hang
    # the whole feed generation.
    _REQUEST_TIMEOUT = 30

    def __init__(self, title: str, link: str, is_paid: bool):
        """Build the article and resolve its publication date.

        Fetches ``link`` over HTTP to read the publication time.

        Raises:
            ValueError: If ``link`` has no "/YYYY/MM/DD/" segment or that
                segment is not a valid calendar date.
        """
        self.title = title
        self.link = link
        self.is_paid = is_paid

        # Validate the link before any network access: without a date there
        # is nothing useful the page fetch could add.
        res = re.search(regex_date, link)
        if res is None:
            raise ValueError(f"no /YYYY/MM/DD/ date in article link: {link!r}")
        year, month, day = (int(part) for part in res.groups())

        hour, minute = self._get_time()
        local_date = datetime.datetime(
            year, month, day, hour, minute, tzinfo=ZoneInfo("Europe/Paris")
        )
        self.date = local_date.astimezone(datetime.timezone.utc)

    def _get_time(self):
        """Return the publication time of the article as ``(hour, minute)``.

        Downloads the article page and looks for the "publish" span. Falls
        back to 3:00 when the page cannot be fetched, the span is missing, or
        its text holds no valid time.
        """
        print(f" Retrieving {self.link} to get pub time...")
        try:
            response = requests.get(self.link, timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f" Failed to get it ({err}, defaulting to 3AM)")
            return self._DEFAULT_TIME
        if response.status_code != 200:
            print(f" Failed to get it ({response.status_code}, defaulting to 3AM)")
            return self._DEFAULT_TIME

        soup = BeautifulSoup(response.text, 'html.parser')
        publish = soup.find("span", class_="publish")
        h_m = self._parse_time(publish.text) if publish is not None else None
        if h_m is None:
            print(" Failed to parse it, defaulting to 3AM)")
            return self._DEFAULT_TIME
        return h_m

    @staticmethod
    def _parse_time(text):
        """Extract ``(hour, minute)`` from ``text``, or None if there is no valid time."""
        res = re.search(regex_time, text)
        if not res:
            return None
        hour, minute = int(res.group(1)), int(res.group(2))
        # The pattern accepts any digits; reject values datetime would refuse.
        if hour > 23 or minute > 59:
            return None
        return (hour, minute)

    def full_title(self):
        """Return the title, prefixed with "[€] " when the article is paid."""
        paid = "[€] " if self.is_paid else ""
        return f'{paid}{self.title}'

    def __str__(self):
        """Return the full title followed by the publication date in brackets."""
        return f'{self.full_title()} [{self.date}]'

    def __repr__(self):
        """Return the same text as ``str()``."""
        return self.__str__()
def generate_feed(town_url, feed_url, feed_path):
    """Scrape a town page and write its articles as an RSS feed file.

    Args:
        town_url: URL of the town page listing the articles.
        feed_url: Public URL of the feed, used as the channel link.
        feed_path: Path of the XML file to write.

    Returns:
        True when the feed file was written, False when the town page could
        not be retrieved or does not contain the expected article lists.
    """
    print(f"Retrieving {town_url}...")
    try:
        # Bounded wait so a stalled server cannot hang the script forever.
        response = requests.get(town_url, timeout=30)
    except requests.RequestException as err:
        print(f"Failed to get url: {err}")
        return False
    if response.status_code != 200:
        print(f"Failed to get url: {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')
    sections = [soup.find(id=section_id)
                for section_id in ("ListUneMain", "ListUneSecondary")]
    if all(section is None for section in sections):
        # Page layout is not the expected one: keep any existing feed file
        # rather than overwriting it with an empty feed.
        print("Failed to find the article lists in the page")
        return False
    small_articles = [
        entry
        for section in sections
        if section is not None
        for entry in section.find_all("div", class_="wrapperClickArticle")
    ]
    print(f"Found {len(small_articles)} articles...")

    articles = []
    for small_article in small_articles:
        spans = small_article.find_all("span")
        anchor = small_article.find("a")
        href = anchor.get("href") if anchor is not None else None
        if len(spans) < 2 or not href:
            print(" Skipping an entry without title or link")
            continue
        # urljoin handles site-relative and already-absolute hrefs alike.
        link = urljoin(town_url, href)
        is_paid = small_article.find("span", class_="flagPaid") is not None
        try:
            articles.append(Article(spans[1].text, link, is_paid))
        except (AttributeError, ValueError, requests.RequestException) as err:
            # One malformed or unreachable article must not abort the feed.
            print(f" Skipping {link}: {err}")

    print("Generating feed...")
    items = [
        Item(title=a.full_title(), link=a.link, guid=Guid(a.link), pubDate=a.date)
        for a in articles
    ]
    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        page_title = title_tag.string
    else:
        # No usable <title>: fall back to the page URL as channel title.
        page_title = town_url
    feed = Feed(title=page_title,
                link=feed_url,
                description=page_title + " - Derniers articles",
                language="fr-FR",
                lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
                items=items)

    print("Writing feed...")
    # Build the document before opening the file so a serialization error
    # does not leave a truncated feed behind.
    pretty_xml = xml.dom.minidom.parseString(feed.rss()).toprettyxml()
    # The XML declaration names no encoding, which means UTF-8: write UTF-8
    # explicitly instead of relying on the platform default.
    with open(feed_path, 'w', encoding="utf-8") as out_rss:
        out_rss.write(pretty_xml)
    print("All done!")
    return True
if __name__ == '__main__':
    # Other town pages previously listed here; their assignments were
    # immediately overwritten, so only the last URL was ever used:
    #   https://www.leprogres.fr/c/rhone/69163-quincieux
    #   https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil
    town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"
    feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml"
    feed_path = "38124-corbelin.xml"
    # Report failure through the exit status so schedulers can detect it.
    raise SystemExit(0 if generate_feed(town_url, feed_url, feed_path) else 1)