kvssachsen2atom - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
(HTM) git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
---
kvssachsen2atom (3188B)
---
1 #!/usr/bin/env python
2 # coding=utf-8
3 #
4 # Copy me if you can.
5 # by 20h
6 #
7
8 import os
9 import sys
10 import getopt
11
12 from selenium import webdriver
13 from selenium.webdriver.chrome.options import Options as chromeoptions
14 from selenium.webdriver.support.ui import WebDriverWait
15 from selenium.webdriver.support import expected_conditions as EC
16 from selenium.webdriver.common.by import By
17
18 from datetime import datetime
19 import pytz
20
def usage(app):
    """Write the command synopsis to stderr and exit with status 1.

    app is the path the program was invoked as; only its final path
    component appears in the message.
    """
    progname = os.path.basename(app)
    sys.stderr.write("usage: %s [-h] URI\n" % (progname,))
    raise SystemExit(1)
25
def _xml_escape(text):
    """Escape &, <, > and double quotes for XML text and attribute values."""
    # Imported locally so this helper does not rely on the file's import block.
    from xml.sax.saxutils import escape
    return escape(str(text), {'"': "&quot;"})


def _cdata(text):
    """Wrap text in a CDATA section.

    A literal "]]>" inside text would end the section early, so it is split
    across two adjacent CDATA sections.
    """
    return "<![CDATA[%s]]>" % (str(text).replace("]]>", "]]]]><![CDATA[>"),)


def _site_root(uri):
    """Return the "scheme://host" prefix of uri, i.e. everything before the path."""
    # Keeping the first three "/"-separated pieces also works for a URI that
    # has no path component at all (e.g. "https://host").
    return "/".join(uri.split("/", 3)[:3])


def main(args):
    """Print an Atom feed for the news list found at the given URI.

    args is the full argument vector (program name first). The only option
    is -h; the first positional argument is the page to load. The page is
    opened in headless Chrome through selenium, the feed is written to
    stdout, and diagnostics go to stderr.

    Returns 0 after the feed has been written. Exits through usage() on -h,
    on an option error, or when no URI is given. Raises IndexError if the
    page has no list container or no og:title meta element; exceptions from
    selenium (for example when the 60 second wait expires) propagate, but
    the browser is always shut down first.
    """
    try:
        opts, largs = getopt.getopt(args[1:], "h")
    except getopt.GetoptError as err:
        # stdout carries the feed, so the error message belongs on stderr.
        sys.stderr.write("%s\n" % (err,))
        usage(args[0])

    for opt, _ in opts:
        if opt == "-h":
            usage(args[0])

    if len(largs) < 1:
        usage(args[0])

    link = largs[0]

    options = chromeoptions()
    chromearguments = [
        "headless",
        "no-sandbox",
        "disable-extensions",
        "disable-dev-shm-usage",
        "start-maximized",
        "window-size=1900,1080",
        "disable-gpu",
    ]
    for carg in chromearguments:
        options.add_argument(carg)

    driver = webdriver.Chrome(options=options)
    try:
        driver.get(link)

        # Block until the page has produced at least one element carrying
        # a data-last-letter attribute before looking for the list.
        WebDriverWait(driver=driver, timeout=60).until(
            EC.presence_of_element_located(
                (By.XPATH, "//div[@data-last-letter]")
            )
        )
        newslist = driver.find_elements(
            By.XPATH, "//div[@data-filter-target=\"list\"]")[0]

        title = driver.find_elements(
            By.XPATH, "//meta[@property=\"og:title\"]")[0].get_attribute("content")
        # The page title doubles as the feed subtitle.
        description = title

        print("""<?xml version="1.0" encoding="utf-8"?>""")
        print("""<feed xmlns="http://www.w3.org/2005/Atom">""")
        print("\t<title>%s</title>" % (_cdata(title)))
        print("\t<subtitle>%s</subtitle>" % (_cdata(description)))
        print("\t<id>%s</id>" % (_xml_escape(link)))
        print("\t<link href=\"%s\" rel=\"self\" />" % (_xml_escape(link)))
        print("\t<link href=\"%s\" />" % (_xml_escape(link)))

        utcnow = datetime.now(pytz.utc)
        print("\t<updated>%s</updated>" % (utcnow.isoformat()))

        articles = newslist.find_elements(By.XPATH, "./div")
        baselink = _site_root(link)
        # Entries are emitted in reverse document order.
        for article in articles[::-1]:
            anchors = article.find_elements(By.XPATH, "./a")
            times = article.find_elements(By.XPATH, ".//time")
            if not anchors or not times:
                # Not shaped like a news entry; skip it the same way an
                # unparsable date is skipped below.
                continue

            anchor = anchors[0]
            plink = anchor.get_attribute("href")
            if not plink:
                continue
            if not plink.startswith("http"):
                # Strip the leading slash so a root-relative href does not
                # produce "host//path".
                plink = "%s/%s" % (baselink, plink.lstrip("/"))
            ptitle = anchor.get_attribute("data-title")
            pcontent = article.text
            pauthor = "sachsen@kvsachsen.de"

            # Normalize datetime.
            updateds = times[0].text
            try:
                dtupdated = datetime.strptime(updateds, "%d.%m.%Y")
            except ValueError:
                continue

            # Only a date is parsed from the page; pin the time to noon UTC.
            dtupdated = dtupdated.replace(hour=12, minute=0,
                                          second=0, tzinfo=pytz.utc)
            # Clamp a year that lies in the future to the current year.
            if dtupdated.year > utcnow.year:
                dtupdated = dtupdated.replace(year=utcnow.year)
            pupdated = dtupdated

            print("\t<entry>")
            print("\t\t<id>%s</id>" % (_xml_escape(plink)))
            print("\t\t<title>%s</title>" % (_cdata(ptitle)))
            print("\t\t<link href=\"%s\" />" % (_xml_escape(plink)))
            print("\t\t<author><name>%s</name></author>" % (pauthor))
            print("\t\t<updated>%s</updated>" % (pupdated.isoformat()))
            print("\t\t<content>%s</content>" % (_cdata(pcontent)))
            print("\t</entry>")

        print("</feed>")
    finally:
        # Always shut the browser down, even when scraping fails, so no
        # headless Chrome process is left behind.
        driver.quit()

    return 0
118
# Script entry point: run main() on the process arguments and use its
# return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
121