Add example selenium script for the atom hackathon. - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
(HTM) git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
---
(DIR) commit a7cd0c547c792f74b7784cc0a8c806380a28ca2f
(DIR) parent 2922c09dc4919dcea4ac331bbaa4e373ba4ccc4a
(HTM) Author: Christoph Lohmann <20h@r-36.net>
Date: Thu, 10 Aug 2023 16:10:01 +0200
Add example selenium script for the atom hackathon.
Diffstat:
A sfeed-atom/kvssachsen2atom | 121 +++++++++++++++++++++++++++++++
1 file changed, 121 insertions(+), 0 deletions(-)
---
(DIR) diff --git a/sfeed-atom/kvssachsen2atom b/sfeed-atom/kvssachsen2atom
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+# coding=utf-8
+#
+# Copy me if you can.
+# by 20h
+#
+
+import os
+import sys
+import getopt
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options as chromeoptions
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+
+from datetime import datetime
+import pytz
+
+def usage(app):
+    """Write the one-line usage message to stderr and exit with status 1.
+
+    app is the program path as invoked; only its basename is shown.
+    This function does not return.
+    """
+    progname = os.path.basename(app)
+    message = "usage: %s [-h] URI\n" % (progname)
+    sys.stderr.write(message)
+    raise SystemExit(1)
+
+def _cdata(text):
+    """Wrap text in a CDATA section, splitting any embedded "]]>" terminator."""
+    return "<![CDATA[%s]]>" % (str(text).replace("]]>", "]]]]><![CDATA[>"))
+
+def main(args):
+    """Print the news list of the page at the given URI as an Atom feed.
+
+    args is the full argument vector: args[0] is the program name, the
+    only accepted option is -h and the first positional argument is the
+    URI of the page to convert.  The page is loaded in headless Chrome
+    through Selenium and the feed is written to stdout.
+
+    Returns 0 on success and 1 if the page contains no news list
+    container.  Bad arguments (and -h) leave through usage(), which
+    exits the process.  A Selenium timeout while waiting for the page
+    propagates to the caller after the browser has been shut down.
+    """
+    # Function-scope imports: nothing else in the script needs them.
+    from urllib.parse import urljoin
+    from xml.sax.saxutils import escape, quoteattr
+
+    try:
+        opts, largs = getopt.getopt(args[1:], "h")
+    except getopt.GetoptError as err:
+        # Diagnostics go to stderr; stdout is reserved for the feed.
+        sys.stderr.write("%s\n" % (err))
+        usage(args[0])
+
+    for opt, _ in opts:
+        # "-h" is the only option in the getopt string above, so any
+        # other value is a programming error.  Raised explicitly so the
+        # check survives python -O.
+        if opt != "-h":
+            raise AssertionError("unhandled option")
+        usage(args[0])
+
+    if len(largs) < 1:
+        usage(args[0])
+
+    link = largs[0]
+
+    options = chromeoptions()
+    chromearguments = [
+        "headless",
+        "no-sandbox",
+        "disable-extensions",
+        "disable-dev-shm-usage",
+        "start-maximized",
+        "window-size=1900,1080",
+        "disable-gpu",
+    ]
+    for carg in chromearguments:
+        options.add_argument(carg)
+
+    driver = webdriver.Chrome(options=options)
+    try:
+        driver.get(link)
+
+        # Block (up to 60 seconds) until a div carrying a
+        # data-last-letter attribute is present in the document.
+        WebDriverWait(driver=driver, timeout=60).until(
+            EC.presence_of_element_located(
+                (By.XPATH, "//div[@data-last-letter]")
+            )
+        )
+
+        newslists = driver.find_elements(
+            By.XPATH, "//div[@data-filter-target=\"list\"]")
+        if not newslists:
+            sys.stderr.write("%s: no news list found\n" % (link))
+            return 1
+        newslist = newslists[0]
+
+        # Feed title comes from the og:title meta tag; fall back to the
+        # document title when the tag is absent or empty.
+        metas = driver.find_elements(
+            By.XPATH, "//meta[@property=\"og:title\"]")
+        title = metas[0].get_attribute("content") if metas else None
+        if not title:
+            title = driver.title
+        description = title
+
+        print("""<?xml version="1.0" encoding="utf-8"?>""")
+        print("""<feed xmlns="http://www.w3.org/2005/Atom">""")
+        print("\t<title>%s</title>" % (_cdata(title)))
+        print("\t<subtitle>%s</subtitle>" % (_cdata(description)))
+        # URLs may contain "&" and quotes, so they are escaped before
+        # being placed into element text or attribute values.
+        print("\t<id>%s</id>" % (escape(link)))
+        print("\t<link href=%s rel=\"self\" />" % (quoteattr(link)))
+        print("\t<link href=%s />" % (quoteattr(link)))
+
+        utcnow = datetime.now(pytz.utc)
+        print("\t<updated>%s</updated>" % (utcnow.isoformat()))
+
+        articles = newslist.find_elements(By.XPATH, "./div")
+        # Entries are emitted in reverse document order.
+        for article in reversed(articles):
+            anchors = article.find_elements(By.XPATH, "./a")
+            times = article.find_elements(By.XPATH, ".//time")
+            # Entries lacking a link or a date are skipped, the same way
+            # entries with an unparsable date are.
+            if not anchors or not times:
+                continue
+
+            anchor = anchors[0]
+            href = anchor.get_attribute("href")
+            if not href:
+                continue
+            # Resolves relative hrefs against the page URI and leaves
+            # absolute ones untouched.
+            plink = urljoin(link, href)
+            ptitle = anchor.get_attribute("data-title") or anchor.text
+            pcontent = article.text
+            pauthor = "sachsen@kvsachsen.de"
+
+            # Normalize datetime: the page only gives a date, so the
+            # entry is stamped at 12:00 UTC of that day.
+            try:
+                dtupdated = datetime.strptime(
+                    times[0].text.strip(), "%d.%m.%Y")
+                dtupdated = dtupdated.replace(
+                    hour=12, minute=0, second=0, tzinfo=pytz.utc)
+                if dtupdated.year > utcnow.year:
+                    # Can raise ValueError for 29 February when the
+                    # current year is not a leap year.
+                    dtupdated = dtupdated.replace(year=utcnow.year)
+            except ValueError:
+                continue
+            pupdated = dtupdated
+
+            print("\t<entry>")
+            print("\t\t<id>%s</id>" % (escape(plink)))
+            print("\t\t<title>%s</title>" % (_cdata(ptitle)))
+            print("\t\t<link href=%s />" % (quoteattr(plink)))
+            print("\t\t<author><name>%s</name></author>" % (escape(pauthor)))
+            print("\t\t<updated>%s</updated>" % (pupdated.isoformat()))
+            print("\t\t<content>%s</content>" % (_cdata(pcontent)))
+            print("\t</entry>")
+
+        print("</feed>")
+    finally:
+        # Always shut the browser down, including on timeouts and other
+        # errors, so no Chrome process is left behind.
+        driver.quit()
+
+    return 0
+
+# Script entry point: main()'s return value becomes the exit status.
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv))
+