First version
Generate RSS feeds from URLs, generate a basic website, and upload the result over SFTP.
This commit is contained in:
parent
a508ad697e
commit
a0e1913939
5 changed files with 222 additions and 0 deletions
102
gen_feed.py
Normal file
102
gen_feed.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
import requests
|
||||||
|
import re
|
||||||
|
import datetime
|
||||||
|
import xml.dom.minidom
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from zoneinfo import ZoneInfo
|
||||||
|
from rfeed import *
|
||||||
|
|
||||||
|
|
||||||
|
# Matches the "/YYYY/MM/DD/" segment of an article link (year, month, day).
regex_date = r"/(\d{4})/(\d{2})/(\d{2})/"
# Matches the "à HH:MM" time inside the article page's publish label.
regex_time = r"à (\d+):(\d{2})"


class Article:
    """A scraped article: title, link, paywall flag and publication date.

    The calendar date comes from the ``/YYYY/MM/DD/`` segment of the link and
    the time of day from the article page itself. The combination is
    interpreted as Europe/Paris local time and stored in ``self.date`` as an
    aware UTC datetime.
    """

    # (hour, minute) used when the article page yields no publication time.
    _DEFAULT_TIME = (3, 0)
    # Seconds to wait for the article page before giving up on it.
    _TIMEOUT = 30

    def __init__(self, title: str, link: str, is_paid: bool):
        """Store the article fields and resolve its publication date.

        Fetches ``link`` over HTTP (see ``_get_time``) to find the time of day.

        Raises:
            ValueError: if ``link`` contains no ``/YYYY/MM/DD/`` segment.
        """
        self.title = title
        self.link = link
        self.is_paid = is_paid

        res = re.search(regex_date, link)
        if res is None:
            # Checked before any network access so the failure names the link
            # instead of surfacing as an AttributeError on None.
            raise ValueError(f"No /YYYY/MM/DD/ date in article link: {link}")
        year, month, day = (int(part) for part in res.groups())
        hour, minute = self._get_time()

        local = datetime.datetime(
            year, month, day, hour, minute, tzinfo=ZoneInfo("Europe/Paris")
        )
        self.date = local.astimezone(datetime.timezone.utc)

    def _get_time(self):
        """Return the ``(hour, minute)`` publication time read from the page.

        Best effort: returns 03:00 when the page cannot be fetched, answers
        with a non-200 status, has no ``span.publish`` element, or that
        element holds no "à HH:MM" time.
        """
        print(f" Retrieving {self.link} to get pub time...")
        try:
            response = requests.get(self.link, timeout=self._TIMEOUT)
        except requests.RequestException as err:
            print(f" Failed to get it ({err}), defaulting to 3AM")
            return self._DEFAULT_TIME
        if response.status_code != 200:
            print(f" Failed to get it ({response.status_code}), defaulting to 3AM")
            return self._DEFAULT_TIME

        soup = BeautifulSoup(response.text, 'html.parser')
        publish = soup.find("span", class_="publish")
        # find() returns None when the span is absent; treat it as "no time".
        res = re.search(regex_time, publish.text) if publish is not None else None
        if not res:
            print(" Failed to parse it, defaulting to 3AM")
            return self._DEFAULT_TIME
        return (int(res.group(1)), int(res.group(2)))

    def full_title(self):
        """Return the title, prefixed with "[€] " for paid articles."""
        prefix = "[€] " if self.is_paid else ""
        return f'{prefix}{self.title}'

    def __str__(self):
        """Return the full title followed by the UTC date in brackets."""
        return f'{self.full_title()} [{self.date}]'

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_feed(town_url, feed_url, feed_path):
    """Scrape a town page and write its articles to an RSS file.

    Args:
        town_url: URL of the town's article listing page.
        feed_url: Public URL of the feed, used as the channel link.
        feed_path: Local path the RSS XML is written to.

    Returns:
        True when the feed was written; False when the page could not be
        fetched or contains neither of the expected article lists.
    """
    main_domain = urlparse(town_url).netloc

    print(f"Retrieving {town_url}...")
    try:
        response = requests.get(town_url, timeout=30)
    except requests.RequestException as err:
        # Report network failures the same way as a bad status code.
        print(f"Failed to get url: {err}")
        return False
    if response.status_code != 200:
        print(f"Failed to get url: {response.status_code}")
        return False

    soup = BeautifulSoup(response.text, 'html.parser')

    # Either list may be absent from the page; only fail when both are.
    containers = [
        soup.find(id=list_id) for list_id in ("ListUneMain", "ListUneSecondary")
    ]
    containers = [c for c in containers if c is not None]
    if not containers:
        print("Failed to find the article lists in the page")
        return False

    small_articles = []
    for container in containers:
        small_articles += container.find_all("div", class_="wrapperClickArticle")
    print(f"Found {len(small_articles)} articles...")

    articles = []
    for small_article in small_articles:
        # The title is taken from the second <span> of the article block.
        t = small_article.find_all("span")[1].text
        link = "https://" + main_domain + small_article.find("a").get("href")
        is_paid = len(small_article.find_all("span", class_="flagPaid")) > 0
        articles.append(Article(t, link, is_paid))

    print("Generating feed...")
    items = [
        Item(title=a.full_title(), link=a.link, guid=Guid(a.link), pubDate=a.date)
        for a in articles
    ]
    feed = Feed(
        title=soup.title.string,
        link=feed_url,
        description=soup.title.string + " - Derniers articles",
        language="fr-FR",
        # timezone.utc rather than datetime.UTC: the latter needs Python 3.11+.
        lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
        items=items,
    )

    print("Writing feed...")
    # Build the document before opening the file so a failure here does not
    # leave a truncated feed behind.
    dom = xml.dom.minidom.parseString(feed.rss())
    pretty = dom.toprettyxml()
    # toprettyxml() without an encoding returns str and declares no encoding,
    # which XML readers take to mean UTF-8, so write it as UTF-8 explicitly
    # instead of relying on the platform default.
    with open(feed_path, 'w', encoding="utf-8") as out_rss:
        out_rss.write(pretty)
    print("All done!")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Manual run for a single town. Earlier assignments to town_url that were
    # immediately overwritten pointed at:
    #   https://www.leprogres.fr/c/rhone/69163-quincieux
    #   https://www.ledauphine.com/c/isere/38425-saint-maurice-l-exil
    town_url = "https://www.ledauphine.com/c/isere/38124-corbelin"
    feed_url = "https://www.onnula.fr/infos/38124-corbelin.xml"
    feed_path = "38124-corbelin.xml"
    # Reflect generate_feed's True/False result in the process exit status.
    raise SystemExit(0 if generate_feed(town_url, feed_url, feed_path) else 1)
|
||||||
79
gen_website.py
Normal file
79
gen_website.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
import json
|
||||||
|
import locale
|
||||||
|
import paramiko
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from gen_feed import generate_feed
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
def get_publisher(link):
    """Return the newspaper name for a town page URL, or "" if unknown.

    The decision is made on the URL's host (the exact domain or any of its
    subdomains), case-insensitively, so a known domain that only appears in
    the path or query string is not mistaken for the publisher.
    """
    # Function-scope import: this module does not import urllib at the top.
    from urllib.parse import urlparse

    publishers = (
        ("ledauphine.com", "Le Dauphiné Libéré"),
        ("bienpublic.com", "Le Bien Public"),
        ("leprogres.fr", "Le Progrès"),
    )

    host = urlparse(link).hostname
    if not host:
        # Scheme-less input such as "www.leprogres.fr/c/...": the host is
        # whatever precedes the first "/", minus an optional ":port".
        host = link.split("/", 1)[0].split(":", 1)[0]
    host = host.lower()

    for domain, name in publishers:
        if host == domain or host.endswith("." + domain):
            return name
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def upload_to_sftp(towns):
    """Upload ``index.html`` and every town feed file to ``www/`` over SFTP.

    Connection settings are read from the FTP_HOST, FTP_PORT, FTP_USER and
    FTP_PASSWORD environment variables.

    Args:
        towns: Sequence of town entries; index 3 of each entry is the local
            feed file name, reused as the remote name under ``www/``.

    Raises:
        KeyError: if one of the environment variables is missing.
        ValueError: if FTP_PORT is not an integer.
    """
    host = os.environ["FTP_HOST"]
    port = int(os.environ["FTP_PORT"])
    username = os.environ["FTP_USER"]
    password = os.environ["FTP_PASSWORD"]

    transport = paramiko.Transport((host, port))
    # try/finally so the connection is released even when authentication or
    # one of the uploads fails.
    try:
        # NOTE(review): no host key is passed to connect(), so the server's
        # identity is presumably not verified -- confirm this is acceptable.
        transport.connect(username=username, password=password)
        sftp = paramiko.SFTPClient.from_transport(transport)
        try:
            sftp.put("index.html", "www/index.html")
            for town in towns:
                feed_file = town[3]
                sftp.put(feed_file, f"www/{feed_file}")
        finally:
            sftp.close()
    finally:
        transport.close()
|
||||||
|
|
||||||
|
|
||||||
|
def generate_web():
    """Generate every town feed, render ``index.html`` and upload the site.

    Reads the comma-separated town page URLs from the URLS environment
    variable and the public base URL of the site from WEBSITE. Each feed is
    written to ``<town id>.xml``, where the town id is the last path segment
    of the town URL. ``template.html`` is then filled in and written to
    ``index.html`` before everything is uploaded.

    Returns:
        True on success; False when no URL is configured or a feed could not
        be generated (nothing is uploaded in that case).
    """
    # Function-scope import: this module does not import html at the top.
    from html import escape

    # Load from env. Tolerate spaces around entries, a trailing comma, and a
    # trailing slash on the site URL (which would otherwise give "//").
    urls = [u.strip() for u in os.environ["URLS"].split(",") if u.strip()]
    website = os.environ["WEBSITE"].rstrip("/")
    if not urls:
        print("No URL configured in URLS")
        return False

    towns = []
    print("Getting articles for each link...")
    for link in urls:
        # rstrip so a trailing slash does not produce an empty town id.
        town_id = link.rstrip("/").rsplit('/', 1)[-1]
        town_xml = town_id + ".xml"
        town_rss = website + "/" + town_xml
        if not generate_feed(link, town_rss, town_xml):
            print(f"Failed to generate feed for {town_id}")
            return False
        towns.append((town_id, town_rss, get_publisher(link), town_xml))

    print("Updating HTML template...")
    with open("template.html", 'r', encoding="utf-8") as template:
        data = template.read()

    # Escape the values so characters such as "&" or '"' cannot break the page.
    li = "".join(
        f'<li><a href="{escape(rss)}">{escape(name)}</a> ({escape(publisher)})</li>'
        for name, rss, publisher, _xml in towns
    )
    data = data.replace("$$$TOWN_LIST$$$", li)

    # The French locale only affects the day and month names below; if it is
    # not installed, keep going with the current locale instead of discarding
    # the feeds that were just generated.
    try:
        locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
    except locale.Error:
        print("Locale fr_FR.UTF-8 is not available, keeping the current locale")
    now = datetime.now()
    data = data.replace(
        "$$$GENERATED_DATETIME$$$", now.strftime("%H:%M le %A %d %B %Y")
    )
    with open("index.html", 'w', encoding="utf-8") as out:
        out.write(data)

    print("Uploading to FTP...")
    upload_to_sftp(towns)

    print("All done!")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Pull the FTP_*, WEBSITE and URLS settings from a local .env file.
    load_dotenv()
    # Reflect generate_web's True/False result in the process exit status so
    # schedulers can detect a failed run.
    raise SystemExit(0 if generate_web() else 1)
|
||||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
rfeed
|
||||||
|
beautifulsoup4
|
||||||
|
requests
|
||||||
|
paramiko
|
||||||
|
python-dotenv
|
||||||
6
template.env
Normal file
6
template.env
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
FTP_HOST=
|
||||||
|
FTP_PORT=
|
||||||
|
FTP_USER=
|
||||||
|
FTP_PASSWORD=
|
||||||
|
WEBSITE=
|
||||||
|
URLS=
|
||||||
30
template.html
Normal file
30
template.html
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>Flux RSS pour villes et villages couverts par le groupe EBRA</title>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
|
||||||
|
<style>
|
||||||
|
body {
|
||||||
|
background-color: rgb(245, 238, 220);
|
||||||
|
text-align: center;
|
||||||
|
font-family: Arial, Helvetica, sans-serif;
|
||||||
|
}
|
||||||
|
footer {
|
||||||
|
font-size: small;
|
||||||
|
color: rgb(65, 65, 65);
|
||||||
|
}
|
||||||
|
ul {
|
||||||
|
list-style-type: none;
|
||||||
|
padding: 0;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Flux RSS</h1>
|
||||||
|
<h2>Liste des villes couvertes par cette instance</h2>
|
||||||
|
<ul>
|
||||||
|
$$$TOWN_LIST$$$
|
||||||
|
</ul>
|
||||||
|
<footer>Généré à $$$GENERATED_DATETIME$$$ - <a href="https://onnula.fr/forgejo/cyril/RSSEbra">Source</a></footer>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
Loading…
Add table
Add a link
Reference in a new issue