"""Build an RSS feed file from a regional newspaper's town page.

The town page is downloaded, its article teasers are scraped, each article
page is fetched once more to read its publication time, and the result is
written to disk as pretty-printed RSS XML.
"""

import datetime
import re
import xml.dom.minidom
from urllib.parse import urlparse
from zoneinfo import ZoneInfo

import requests
from bs4 import BeautifulSoup
# Item, Guid and Feed used below are expected to come from this star import.
from rfeed import *

# Publication date as it appears in article URLs: /YYYY/MM/DD/
regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"
# Publication time as it appears in the article's "publish" span: "à H:MM"
regex_time = r"à (\d+):(\d{2})"

# Seconds to wait for any single HTTP request; without a timeout
# requests.get() can block indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 10
# (hour, minute) assumed when the real publication time cannot be determined.
_DEFAULT_PUB_TIME = (3, 0)


class Article:
    """An article scraped from a town page.

    Instances carry four attributes set by ``__init__``: ``title``, ``link``,
    ``is_paid`` and ``date`` (a timezone-aware ``datetime`` expressed in UTC).
    """

    def __init__(self, title: str, link: str, is_paid: bool):
        """Create an article and resolve its publication date.

        The calendar date is read from the ``/YYYY/MM/DD/`` segment of
        ``link``; the time of day is read from the article page itself
        (one HTTP request), defaulting to 3:00 when unavailable.

        Args:
            title: Headline text.
            link: Absolute URL of the article.
            is_paid: Whether the article is flagged as paid content.

        Raises:
            ValueError: If ``link`` contains no ``/YYYY/MM/DD/`` segment, or
                that segment is not a valid calendar date.
        """
        self.title = title
        self.link = link
        self.is_paid = is_paid

        # Validate the URL before touching the network so that an undated
        # link fails fast with a clear message.
        res = re.search(regex_date, link)
        if res is None:
            raise ValueError(
                f"No /YYYY/MM/DD/ date found in article link: {link}"
            )
        year, month, day = (int(part) for part in res.groups())
        hour, minute = self._get_time()

        # The scraped date/time is interpreted as Paris local time, then
        # stored in UTC.
        local_date = datetime.datetime(
            year, month, day, hour, minute, tzinfo=ZoneInfo("Europe/Paris")
        )
        self.date = local_date.astimezone(datetime.timezone.utc)

    def _get_time(self):
        """Return the article's publication time as ``(hour, minute)``.

        Fetches ``self.link`` and looks for "à H:MM" inside the
        ``<span class="publish">`` element. Any failure (network error,
        non-200 status, missing span, unparsable or out-of-range time) is
        reported on stdout and yields the default of ``(3, 0)``.
        """
        print(f" Retrieving {self.link} to get pub time...")
        try:
            response = requests.get(self.link, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as err:
            # Best effort: the time of day is not worth aborting the feed for.
            print(f" Failed to get it ({err}, defaulting to 3AM)")
            return _DEFAULT_PUB_TIME
        if response.status_code != 200:
            print(f" Failed to get it ({response.status_code}, defaulting to 3AM)")
            return _DEFAULT_PUB_TIME

        soup = BeautifulSoup(response.text, 'html.parser')
        publish = soup.find("span", class_="publish")
        # find() returns None when the span is absent.
        res = re.search(regex_time, publish.text) if publish is not None else None
        if not res:
            print(" Failed to parse it, defaulting to 3AM)")
            return _DEFAULT_PUB_TIME

        hour, minute = int(res.group(1)), int(res.group(2))
        # regex_time accepts any digits; reject values datetime would refuse.
        if hour > 23 or minute > 59:
            print(" Failed to parse it, defaulting to 3AM)")
            return _DEFAULT_PUB_TIME
        return (hour, minute)

    def _paid_prefix(self):
        """Return the "[€] " marker for paid articles, else an empty string."""
        return "[€] " if self.is_paid else ""

    def full_title(self):
        """Return the title, prefixed with "[€] " when the article is paid."""
        return f'{self._paid_prefix()}{self.title}'

    def __str__(self):
        """Return the full title followed by the publication date in brackets."""
        return f'{self._paid_prefix()}{self.title} [{self.date}]'

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()


def _article_from_teaser(teaser, main_domain):
    """Build an Article from one teaser element, or return None if unusable.

    The title is the text of the teaser's second ``<span>``, the link is the
    first ``<a>``'s ``href`` appended to ``https://<main_domain>``, and the
    paid flag is the presence of a ``<span class="flagPaid">``.
    """
    spans = teaser.find_all("span")
    anchor = teaser.find("a")
    href = anchor.get("href") if anchor is not None else None
    if len(spans) < 2 or not href:
        print(" Skipping a teaser without title or link")
        return None

    link = "https://" + main_domain + href
    is_paid = teaser.find("span", class_="flagPaid") is not None
    try:
        return Article(spans[1].text, link, is_paid)
    except ValueError as err:
        # Raised by Article when the link carries no usable date.
        print(f" Skipping {link}: {err}")
        return None


def generate_feed(town_url, feed_url, feed_path):
    """Scrape ``town_url`` and write its articles to ``feed_path`` as RSS.

    Args:
        town_url: URL of the town page listing the articles.
        feed_url: Public URL of the generated feed, used as the feed's link.
        feed_path: Filesystem path the RSS XML is written to (UTF-8).

    Returns:
        True when the feed file was written, False when the town page did
        not answer with HTTP 200.

    Raises:
        requests.RequestException: If the town page request itself fails
            (connection error, timeout, ...).
    """
    main_domain = urlparse(town_url).netloc

    print(f"Retrieving {town_url}...")
    response = requests.get(town_url, timeout=_REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"Failed to get url: {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')

    # Teasers live in two containers; either may be missing from the page.
    small_articles = []
    for container_id in ("ListUneMain", "ListUneSecondary"):
        container = soup.find(id=container_id)
        if container is None:
            print(f"No #{container_id} section on the page, ignoring it")
            continue
        small_articles.extend(
            container.find_all("div", class_="wrapperClickArticle")
        )
    print(f"Found {len(small_articles)} articles...")

    articles = []
    for small_article in small_articles:
        article = _article_from_teaser(small_article, main_domain)
        if article is not None:
            articles.append(article)

    print("Generating feed...")
    items = [
        Item(title=a.full_title(), link=a.link, guid=Guid(a.link), pubDate=a.date)
        for a in articles
    ]

    # Fall back to the domain when the page has no usable <title>.
    if soup.title is not None and soup.title.string:
        page_title = soup.title.string
    else:
        page_title = main_domain

    feed = Feed(
        title=page_title,
        link=feed_url,
        description=page_title + " - Derniers articles",
        language="fr-FR",
        lastBuildDate=datetime.datetime.now(datetime.UTC),
        items=items,
    )

    print("Writing feed...")
    # Render before opening the file so a failure here cannot leave a
    # truncated feed on disk.
    pretty_xml = xml.dom.minidom.parseString(feed.rss()).toprettyxml()
    # toprettyxml() without an encoding argument emits an XML declaration
    # with no encoding, which readers treat as UTF-8; write it as such
    # instead of relying on the platform's locale encoding.
    with open(feed_path, 'w', encoding='utf-8') as out_rss:
        out_rss.write(pretty_xml)

    print("All done!")
    return True


if __name__ == '__main__':
    # Other town pages that were assigned here and immediately overwritten:
    #   https://www.leprogres.fr/c/rhone/69163-quincieux
    #   https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil
    town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"
    feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml"
    feed_path = "38124-corbelin.xml"
    generate_feed(town_url, feed_url, feed_path)