# Reconstructed from the gen_feed.py hunk of patch a0e1913 ("First version":
# generate RSS feeds from url, generate a basic website, upload to FTP).
"""Build an RSS feed from the article listing of a newspaper town page."""
import requests
import re
import datetime
import xml.dom.minidom
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from zoneinfo import ZoneInfo
from rfeed import *


# Article links embed their publication date as /YYYY/MM/DD/.
regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"
# The "publish" span of an article page carries the time as "à H:MM".
regex_time = r"à (\d+):(\d{2})"

# Seconds to wait for the remote site; without a timeout requests can block forever.
REQUEST_TIMEOUT = 30
# (hour, minute) used when the publication time cannot be determined.
DEFAULT_PUB_TIME = (3, 0)


class Article:
    """One article of a town page, with its publication date converted to UTC.

    Attributes:
        title: Headline as shown in the listing.
        link: Absolute URL of the article.
        is_paid: True when the listing flags the article as paid content.
        date: Timezone-aware publication datetime, expressed in UTC.
    """

    def __init__(self, title: str, link: str, is_paid: bool):
        """Create the article and resolve its publication date.

        The day comes from the /YYYY/MM/DD/ part of ``link``; the time of day
        is read from the article page itself (one HTTP request).

        Raises:
            ValueError: If ``link`` contains no /YYYY/MM/DD/ segment, or the
                extracted values do not form a valid date.
        """
        self.title = title
        self.link = link
        self.is_paid = is_paid

        res = re.search(regex_date, link)
        if res is None:
            # Fail before the network round trip, with a message naming the link.
            raise ValueError(f"No /YYYY/MM/DD/ date found in article link: {link}")
        year, month, day = (int(part) for part in res.groups())
        hour, minute = self._get_time()

        # The scraped date and time are interpreted as Paris wall-clock time,
        # then normalised to UTC.
        local_date = datetime.datetime(
            year, month, day, hour, minute, tzinfo=ZoneInfo("Europe/Paris")
        )
        self.date = local_date.astimezone(datetime.timezone.utc)

    def _get_time(self) -> tuple[int, int]:
        """Return the (hour, minute) of publication scraped from the article page.

        Best effort: any network, HTTP or parsing failure falls back to
        ``DEFAULT_PUB_TIME`` (3:00 AM) instead of raising.
        """
        print(f" Retrieving {self.link} to get pub time...")
        try:
            response = requests.get(self.link, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f" Failed to get it ({err}, defaulting to 3AM)")
            return DEFAULT_PUB_TIME
        if response.status_code != 200:
            print(f" Failed to get it ({response.status_code}, defaulting to 3AM)")
            return DEFAULT_PUB_TIME

        soup = BeautifulSoup(response.text, 'html.parser')
        # find() returns None when the page has no "publish" span.
        publish = soup.find("span", class_="publish")
        res = re.search(regex_time, publish.text) if publish is not None else None
        if not res:
            print(" Failed to parse it, defaulting to 3AM")
            return DEFAULT_PUB_TIME
        return (int(res.group(1)), int(res.group(2)))

    def _paid_prefix(self) -> str:
        """Return the marker prepended to titles of paid articles."""
        return "[€] " if self.is_paid else ""

    def full_title(self) -> str:
        """Return the title, prefixed with "[€] " for paid articles."""
        return f'{self._paid_prefix()}{self.title}'

    def __str__(self) -> str:
        return f'{self._paid_prefix()}{self.title} [{self.date}]'

    def __repr__(self) -> str:
        return self.__str__()


def _parse_article(small_article, main_domain):
    """Build an Article from one listing entry, or return None if it is unusable."""
    spans = small_article.find_all("span")
    anchor = small_article.find("a")
    href = anchor.get("href") if anchor is not None else None
    if len(spans) < 2 or not href:
        print(" Skipping entry without title or link")
        return None

    link = "https://" + main_domain + href
    is_paid = small_article.find("span", class_="flagPaid") is not None
    try:
        # The title is the second span of the entry.
        return Article(spans[1].text, link, is_paid)
    except ValueError as err:
        print(f" Skipping entry: {err}")
        return None


def generate_feed(town_url: str, feed_url: str, feed_path: str) -> bool:
    """Scrape the articles of a town page and write them as an RSS file.

    Args:
        town_url: URL of the town page listing the articles.
        feed_url: Public URL under which the feed will be served; used as the
            feed's ``link``.
        feed_path: Local path of the RSS file to write.

    Returns:
        True when the feed was written. False when the town page could not be
        fetched or contains neither of the expected article lists.
    """
    main_domain = urlparse(town_url).netloc

    print(f"Retrieving {town_url}...")
    try:
        response = requests.get(town_url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f"Failed to get url: {err}")
        return False
    if response.status_code != 200:
        print(f"Failed to get url: {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')

    # Either list may be absent from a page; collect from whichever exists.
    containers = [soup.find(id=list_id) for list_id in ("ListUneMain", "ListUneSecondary")]
    containers = [c for c in containers if c is not None]
    if not containers:
        # Unexpected page layout: report failure rather than write an empty feed.
        print("Failed to find any article list in the page")
        return False

    small_articles = []
    for container in containers:
        small_articles.extend(container.find_all("div", class_="wrapperClickArticle"))
    print(f"Found {len(small_articles)} articles...")

    parsed = (_parse_article(entry, main_domain) for entry in small_articles)
    articles = [article for article in parsed if article is not None]

    print("Generating feed...")
    items = [
        Item(title=a.full_title(), link=a.link, guid=Guid(a.link), pubDate=a.date)
        for a in articles
    ]
    feed = Feed(title=soup.title.string,
                link=feed_url,
                description=soup.title.string + " - Derniers articles",
                language="fr-FR",
                lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
                items=items)

    print("Writing feed...")
    # Serialise before opening the file so a failure cannot truncate an
    # existing feed.
    dom = xml.dom.minidom.parseString(feed.rss())
    pretty_xml = dom.toprettyxml()
    # toprettyxml() emits an XML declaration without an encoding, which means
    # UTF-8; write the file as UTF-8 regardless of the platform locale.
    with open(feed_path, 'w', encoding='utf-8') as out_rss:
        out_rss.write(pretty_xml)
    print("All done!")
    return True


if __name__ == '__main__':
    # Successive assignments: only the last one takes effect.
    town_url = "https://www.leprogres.fr/c/rhone/69163-quincieux"
    town_url = "https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil"
    town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"
+ feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml" + feed_path = "38124-corbelin.xml" + generate_feed(town_url, feed_url, feed_path) \ No newline at end of file diff --git a/gen_website.py b/gen_website.py new file mode 100644 index 0000000..90491f9 --- /dev/null +++ b/gen_website.py @@ -0,0 +1,79 @@ +import json +import locale +import paramiko +import os +from datetime import datetime +from gen_feed import generate_feed +from dotenv import load_dotenv + +def get_publisher(link): + if "ledauphine.com" in link: + return "Le Dauphiné Libéré" + if "bienpublic.com" in link: + return "Le Bien Public" + if "leprogres.fr" in link: + return "Le Progrès" + return "" + + +def upload_to_sftp(towns): + + host = os.environ["FTP_HOST"] + port = int(os.environ["FTP_PORT"]) + username = os.environ["FTP_USER"] + password = os.environ["FTP_PASSWORD"] + + transport = paramiko.Transport((host, port)) + transport.connect(username=username, password=password) + + sftp = paramiko.SFTPClient.from_transport(transport) + + sftp.put("index.html", "www/index.html") + for t in towns: + sftp.put(t[3], f"www/{t[3]}") + + sftp.close() + transport.close() + + +def generate_web(): + # Load from env + urls = os.environ["URLS"].split(",") + website = os.environ["WEBSITE"] + + towns = [] + print("Getting articles for each link...") + for link in urls: + town_id = link.rsplit('/', 1)[-1] + town_xml = town_id + ".xml" + town_rss = website + "/" + town_xml + if not generate_feed(link, town_rss, town_xml): + print(f"Failed to generate feed for {town_id}") + return False + towns.append((town_id, town_rss, get_publisher(link), town_xml)) + + print("Updating HTML template...") + with open("template.html", 'r', encoding="utf-8") as template: + data = template.read() + + li = "" + for town in towns: + li += f'