First version
Generate RSS feeds from a URL; generate a basic website; upload to FTP.
This commit is contained in:
parent
a508ad697e
commit
a0e1913939
5 changed files with 222 additions and 0 deletions
102
gen_feed.py
Normal file
102
gen_feed.py
Normal file
|
|
@ -0,0 +1,102 @@
|
|||
import requests
|
||||
import re
|
||||
import datetime
|
||||
import xml.dom.minidom
|
||||
from bs4 import BeautifulSoup
|
||||
from urllib.parse import urlparse
|
||||
from zoneinfo import ZoneInfo
|
||||
from rfeed import *
|
||||
|
||||
|
||||
# Matches a "/YYYY/MM/DD/" path segment; Article.__init__ searches the article
# link with it and reads the three groups as year, month and day.
regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"
|
||||
# Matches "à H:MM" (French for "at H:MM"); Article._get_time searches the text
# of the article page's <span class="publish"> with it and reads the two groups
# as hour and minute.
regex_time = r"à (\d+):(\d{2})"
|
||||
|
||||
class Article:
    """One article listed on a town page.

    Attributes set by ``__init__``:
        title: headline text as passed in.
        link: article URL; must contain a ``/YYYY/MM/DD/`` path segment.
        date: publication datetime, built as Europe/Paris local time and
            converted to an aware UTC datetime.
        is_paid: whether the article was flagged as paid.

    Constructing an instance performs one HTTP GET on ``link`` to read the
    publication time of day (see ``_get_time``).
    """

    # Time of day used whenever the real publication time cannot be obtained.
    _DEFAULT_TIME = (3, 0)
    # Seconds to wait for the article page; without a timeout ``requests``
    # can block indefinitely on an unresponsive server.
    _TIMEOUT = 30

    def __init__(self, title: str, link: str, is_paid: bool):
        """Build the article and resolve its publication date.

        Raises:
            ValueError: if ``link`` contains no ``/YYYY/MM/DD/`` segment, or
                if that segment is not a valid calendar date.
        """
        self.title = title
        self.link = link
        self.is_paid = is_paid

        # Validate the link before doing any network I/O so that an unusable
        # link fails fast with a clear message.
        res = re.search(regex_date, link)
        if res is None:
            raise ValueError(f"No /YYYY/MM/DD/ date found in article link: {link}")
        year, month, day = (int(part) for part in res.groups())

        hour, minute = self._get_time()
        local_date = datetime.datetime(
            year, month, day, hour, minute, tzinfo=ZoneInfo("Europe/Paris")
        )
        self.date = local_date.astimezone(datetime.timezone.utc)

    def _get_time(self) -> tuple[int, int]:
        """Fetch the article page and return its publication time as (hour, minute).

        This is best effort: any failure (network error, non-200 status,
        missing ``<span class="publish">``, unparsable or out-of-range time)
        is reported on stdout and ``(3, 0)`` is returned instead.
        """
        print(f" Retrieving {self.link} to get pub time...")
        try:
            response = requests.get(self.link, timeout=self._TIMEOUT)
        except requests.RequestException as err:
            print(f" Failed to get it ({err}), defaulting to 3AM")
            return self._DEFAULT_TIME
        if response.status_code != 200:
            print(f" Failed to get it ({response.status_code}, defaulting to 3AM)")
            return self._DEFAULT_TIME

        soup = BeautifulSoup(response.text, 'html.parser')
        # find() returns None when the span is absent, so guard before .text.
        publish = soup.find("span", class_="publish")
        res = re.search(regex_time, publish.text) if publish is not None else None
        if not res:
            print(" Failed to parse it, defaulting to 3AM)")
            return self._DEFAULT_TIME

        hour, minute = int(res.group(1)), int(res.group(2))
        # The pattern accepts any digits; reject values datetime() would refuse.
        if hour > 23 or minute > 59:
            print(f" Ignoring out-of-range time {hour}:{minute:02d}, defaulting to 3AM")
            return self._DEFAULT_TIME
        return (hour, minute)

    def full_title(self) -> str:
        """Return the title, prefixed with "[€] " when the article is paid."""
        prefix = "[€] " if self.is_paid else ""
        return f'{prefix}{self.title}'

    def __str__(self) -> str:
        """Return the full title followed by the publication date in brackets."""
        return f'{self.full_title()} [{self.date}]'

    def __repr__(self) -> str:
        return self.__str__()
|
||||
|
||||
|
||||
def generate_feed(town_url, feed_url, feed_path):
    """Scrape a town listing page and write its articles as an RSS file.

    Args:
        town_url: URL of the town page to scrape.
        feed_url: URL stored as the feed's ``link``.
        feed_path: path of the RSS file to write (UTF-8).

    Returns:
        True when the feed was written, False when the town page could not be
        fetched (network error or non-200 status).

    Listing entries lacking a title span or a link are skipped. Each kept
    entry is turned into an ``Article``, whose constructor issues its own
    HTTP request, so the run makes one extra request per article.
    """
    # Function-scope import: the module only imports urlparse at file level.
    from urllib.parse import urljoin

    main_domain = urlparse(town_url).netloc

    print(f"Retrieving {town_url}...")
    try:
        response = requests.get(town_url, timeout=30)
    except requests.RequestException as err:
        print(f"Failed to get url: {err}")
        return False
    if response.status_code != 200:
        print(f"Failed to get url: {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect entries from both listing containers, tolerating a missing one
    # (find() returns None in that case).
    small_articles = []
    for container_id in ("ListUneMain", "ListUneSecondary"):
        container = soup.find(id=container_id)
        if container is None:
            print(f"No element with id {container_id} on the page, skipping it")
            continue
        small_articles.extend(container.find_all("div", class_="wrapperClickArticle"))
    print(f"Found {len(small_articles)} articles...")

    articles = []
    for small_article in small_articles:
        spans = small_article.find_all("span")
        anchor = small_article.find("a")
        href = anchor.get("href") if anchor is not None else None
        # The title is read from the second <span>; skip malformed entries
        # rather than aborting the whole feed.
        if len(spans) < 2 or not href:
            print(" Skipping a listing entry without title or link")
            continue
        # urljoin resolves rooted, relative and already-absolute hrefs alike.
        link = urljoin(town_url, href)
        is_paid = small_article.find("span", class_="flagPaid") is not None
        articles.append(Article(spans[1].text, link, is_paid))

    print("Generating feed...")
    items = [
        Item(title=a.full_title(), link=a.link, guid=Guid(a.link), pubDate=a.date)
        for a in articles
    ]
    # Fall back to the host name when the page has no usable <title>.
    page_title = soup.title.string if soup.title and soup.title.string else main_domain
    feed = Feed(title=page_title,
                link=feed_url,
                description=page_title + " - Derniers articles",
                language="fr-FR",
                lastBuildDate=datetime.datetime.now(datetime.UTC),
                items=items)

    print("Writing feed...")
    # Serialise before opening the file so a failure here does not leave a
    # truncated feed behind.
    pretty_xml = xml.dom.minidom.parseString(feed.rss()).toprettyxml()
    # The XML declaration produced above names no encoding, which XML parsers
    # read as UTF-8, so write UTF-8 explicitly instead of the locale default.
    with open(feed_path, 'w', encoding='utf-8') as out_rss:
        out_rss.write(pretty_xml)
    print("All done!")
    return True
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Alternative town pages that can be swapped in for town_url:
    #   https://www.leprogres.fr/c/rhone/69163-quincieux
    #   https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil
    town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"
    feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml"
    feed_path = "38124-corbelin.xml"
    # Exit non-zero when generate_feed reports failure so that schedulers and
    # shell scripts can detect it.
    raise SystemExit(0 if generate_feed(town_url, feed_url, feed_path) else 1)
|
||||
Loading…
Add table
Add a link
Reference in a new issue