RSSEbra/gen_feed.py

import datetime
import re
import xml.dom.minidom
from urllib.parse import urljoin, urlparse
from zoneinfo import ZoneInfo

import requests
from bs4 import BeautifulSoup
from rfeed import *


# Matches the "/YYYY/MM/DD/" segment of an article link; the three groups are
# used as year, month and day of the publication date.
regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"
# Matches the "à H:MM" time of day inside the text of an article page's
# "publish" span; the two groups are used as hour and minute.
regex_time = r"à (\d+):(\d{2})"

class Article:
    """One article listed on a town page, with its publication date.

    The calendar date comes from the ``/YYYY/MM/DD/`` segment of the article
    link; the time of day is scraped from the article page itself. The
    resulting Europe/Paris local time is stored converted to UTC.

    Attributes:
        title: Headline text as given by the caller.
        link: URL of the article page.
        date: Timezone-aware publication ``datetime``, in UTC.
        is_paid: Whether the article is flagged as paid content.
    """

    # (hour, minute) assumed when the real publication time cannot be scraped.
    _DEFAULT_TIME = (3, 0)
    # Seconds to wait for the article page; without a timeout a stalled
    # server would block the whole feed generation indefinitely.
    _REQUEST_TIMEOUT = 30
    # Marker prepended to the title of paid articles.
    _PAID_PREFIX = "[€] "

    def __init__(self, title: str, link: str, is_paid: bool):
        """Build the article and resolve its publication date.

        Fetches ``link`` over the network to read the publication time.

        Raises:
            ValueError: If ``link`` contains no ``/YYYY/MM/DD/`` segment.
        """
        self.title = title
        self.link = link
        self.is_paid = is_paid

        res = re.search(regex_date, link)
        if res is None:
            # Fail early and explicitly, before spending a network request.
            raise ValueError(f"No /YYYY/MM/DD/ date found in article link: {link}")
        year, month, day = (int(part) for part in res.groups())
        hour, minute = self._get_time()

        local_date = datetime.datetime(
            year, month, day, hour, minute, tzinfo=ZoneInfo("Europe/Paris")
        )
        self.date = local_date.astimezone(datetime.timezone.utc)

    def _get_time(self):
        """Return the ``(hour, minute)`` scraped from the article page.

        This is best effort: a network error, a non-200 response, a missing
        "publish" span or an unparsable/out-of-range time all fall back to
        03:00.
        """
        print(f"   Retrieving {self.link} to get pub time...")
        try:
            response = requests.get(self.link, timeout=self._REQUEST_TIMEOUT)
        except requests.RequestException as err:
            print(f"   Failed to get it ({err}, defaulting to 3AM)")
            return self._DEFAULT_TIME
        if response.status_code != 200:
            print(f"   Failed to get it ({response.status_code}, defaulting to 3AM)")
            return self._DEFAULT_TIME

        soup = BeautifulSoup(response.text, 'html.parser')
        publish = soup.find("span", class_="publish")
        res = re.search(regex_time, publish.text) if publish is not None else None
        if res is None:
            print("   Failed to parse it, defaulting to 3AM")
            return self._DEFAULT_TIME

        hour, minute = int(res.group(1)), int(res.group(2))
        if hour > 23 or minute > 59:
            # The pattern accepts any digits; reject values datetime() refuses.
            print("   Failed to parse it, defaulting to 3AM")
            return self._DEFAULT_TIME
        return (hour, minute)

    def full_title(self):
        """Return the title, prefixed with "[€] " when the article is paid."""
        prefix = self._PAID_PREFIX if self.is_paid else ""
        return f'{prefix}{self.title}'

    def __str__(self):
        """Return the full title followed by the UTC date in brackets."""
        return f'{self.full_title()} [{self.date}]'

    def __repr__(self):
        return self.__str__()


def generate_feed(town_url, feed_url, feed_path, *, timeout=30):
    """Scrape a town listing page and write its articles as an RSS file.

    Args:
        town_url: URL of the listing page to scrape.
        feed_url: URL stored in the feed's ``link`` element.
        feed_path: Path of the XML file to write.
        timeout: Seconds to wait for the listing page before giving up.

    Returns:
        True when the feed file was written; False when the listing page
        could not be fetched or contains neither article section.
    """
    # Article hrefs are resolved against the https root of the listing's host.
    site_root = "https://" + urlparse(town_url).netloc

    print(f"Retrieving {town_url}...")
    try:
        response = requests.get(town_url, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to get url: {err}")
        return False
    if response.status_code != 200:
        print(f"Failed to get url: {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')

    small_articles = []
    sections_found = 0
    for section_id in ("ListUneMain", "ListUneSecondary"):
        section = soup.find(id=section_id)
        if section is None:
            print(f"Section {section_id} not found, skipping it")
            continue
        sections_found += 1
        small_articles.extend(section.find_all("div", class_="wrapperClickArticle"))
    if not sections_found:
        # The page does not look like a listing: do not overwrite the feed.
        print("Failed to find any article section")
        return False
    print(f"Found {len(small_articles)} articles...")

    articles = []
    for small_article in small_articles:
        spans = small_article.find_all("span")
        anchor = small_article.find("a")
        href = anchor.get("href") if anchor is not None else None
        if len(spans) < 2 or not href:
            print("   Skipping an entry without title or link")
            continue
        # urljoin leaves absolute hrefs intact instead of gluing them to the host.
        link = urljoin(site_root, href)
        is_paid = small_article.find("span", class_="flagPaid") is not None
        articles.append(Article(spans[1].text, link, is_paid))

    print("Generating feed...")
    page_title = soup.title.string if soup.title is not None else None
    if not page_title:
        # Fall back to the listing URL when the page has no usable <title>.
        page_title = town_url
    items = [
        Item(title=a.full_title(), link=a.link, guid=Guid(a.link), pubDate=a.date)
        for a in articles
    ]
    feed = Feed(
        title=page_title,
        link=feed_url,
        description=page_title + " - Derniers articles",
        language="fr-FR",
        # timezone.utc matches the rest of the file and works before Python 3.11.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        items=items,
    )

    print("Writing feed...")
    # Serialise first so a failure here cannot truncate an existing feed file.
    pretty_xml = xml.dom.minidom.parseString(feed.rss()).toprettyxml()
    # The XML carries no encoding declaration, so readers assume UTF-8:
    # write it as UTF-8 regardless of the platform's locale encoding.
    with open(feed_path, 'w', encoding='utf-8') as out_rss:
        out_rss.write(pretty_xml)
    print("All done!")
    return True


if __name__ == '__main__':
    # Alternative town pages (previously assigned here and immediately
    # overwritten by the one below):
    #   https://www.leprogres.fr/c/rhone/69163-quincieux
    #   https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil
    town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"
    feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml"
    feed_path = "38124-corbelin.xml"
    # Surface failure through the exit status instead of discarding the result.
    raise SystemExit(0 if generate_feed(town_url, feed_url, feed_path) else 1)
First version Generate RSS feeds from url Generate a basic website Upload to FTP 2026-05-08 16:22:00 +02:00			`import requests`
			`import re`
			`import datetime`
			`import xml.dom.minidom`
			`from bs4 import BeautifulSoup`
			`from urllib.parse import urlparse`
			`from zoneinfo import ZoneInfo`
			`from rfeed import *`


			`regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"`
			`regex_time = r"à (\d+):(\d{2})"`

			`class Article:`
			`def __init__(self, title: str, link: str, is_paid: bool):`
			`self.title = title`
			`self.link = link`
			`res = re.search(regex_date, link)`
			`h_m = self._get_time()`
			`self.date = datetime.datetime(int(res.group(1)), int(res.group(2)), int(res.group(3)), h_m[0], h_m[1])`
			`self.date = self.date.replace(tzinfo=ZoneInfo("Europe/Paris"))`
			`self.date = self.date.astimezone(datetime.timezone.utc)`
			`self.is_paid = is_paid`

			`def _get_time(self):`
			`response = requests.get(self.link)`
			`print(f" Retrieving {self.link} to get pub time...")`
			`if response.status_code != 200:`
			`print(f" Failed to get it ({response.status_code}, defaulting to 3AM)")`
			`return (3, 0) # Default to 3:00 AM`
			`soup = BeautifulSoup(response.text, 'html.parser')`
			`publish = soup.find("span", class_="publish").text`
			`res = re.search(regex_time, publish)`
			`if not res:`
			`print(f" Failed to parse it, defaulting to 3AM)")`
			`return (3, 0) # Default to 3:00 AM`
			`return (int(res.group(1)), int(res.group(2)))`


			`def full_title(self):`
			`paid = ""`
			`if self.is_paid:`
			`paid = "[€] "`
			`return f'{paid}{self.title}'`

			`def __str__(self):`
			`paid = ""`
			`if self.is_paid:`
			`paid = "[€] "`
			`return f'{paid}{self.title} [{self.date}]'`

			`def __repr__(self):`
			`return self.__str__()`


			`def generate_feed(town_url, feed_url, feed_path):`
			`main_domain = urlparse(town_url).netloc`

			`print(f"Retrieving {town_url}...")`
			`response = requests.get(town_url)`
			`if response.status_code != 200:`
			`print(f"Failed to get url: {response.status_code}")`
			`return False`

			`soup = BeautifulSoup(response.text, 'html.parser')`
			`articles = []`

			`small_articles = soup.find(id="ListUneMain").find_all("div", class_="wrapperClickArticle") + soup.find(id="ListUneSecondary").find_all("div", class_="wrapperClickArticle")`
			`print(f"Found {len(small_articles)} articles...")`
			`for small_article in small_articles:`
			`t = small_article.find_all("span")[1].text`
			`link = "https://" + main_domain + small_article.find("a").get("href")`
			`is_paid = len(small_article.find_all("span", class_="flagPaid")) > 0`
			`articles.append(Article(t, link, is_paid))`

			`print("Generating feed...")`
			`items = []`
			`for a in articles:`
			`item = Item(title = a.full_title(), link= a.link, guid = Guid(a.link), pubDate = a.date)`
			`items.append(item)`
			`feed = Feed(title = soup.title.string,`
			`link = feed_url,`
			`description = soup.title.string + " - Derniers articles",`
			`language = "fr-FR",`
			`lastBuildDate = datetime.datetime.now(datetime.UTC),`
			`items = items)`

			`print("Writing feed...")`
			`with open(feed_path, 'w') as out_rss:`
			`dom = xml.dom.minidom.parseString(feed.rss())`
			`out_rss.write(dom.toprettyxml())`
			`print("All done!")`
			`return True`


			`if __name__ == '__main__':`
			`town_url = "https://www.leprogres.fr/c/rhone/69163-quincieux"`
			`town_url = "https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil"`
			`town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"`
			`feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml"`
			`feed_path = "38124-corbelin.xml"`
			`generate_feed(town_url, feed_url, feed_path)`