Add selenium to tsv example from bob. - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
(HTM) git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
---
(DIR) commit d2f3f8bf36e6d7b0d88f7d3e02353bcd87a93795
(DIR) parent a7cd0c547c792f74b7784cc0a8c806380a28ca2f
(HTM) Author: Christoph Lohmann <20h@r-36.net>
Date: Thu, 10 Aug 2023 16:14:57 +0200
Add selenium to tsv example from bob.
Diffstat:
A sfeed-atom/selenium_crawl_tsv.py | 118 +++++++++++++++++++++++++++++++
1 file changed, 118 insertions(+), 0 deletions(-)
---
(DIR) diff --git a/sfeed-atom/selenium_crawl_tsv.py b/sfeed-atom/selenium_crawl_tsv.py
@@ -0,0 +1,118 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
+
+import sys
+from datetime import datetime
+
+def make_escape_content_trans():
+    """Return the str.translate() table used for the TSV content field.
+
+    Backslash, newline and TAB are replaced by two-character backslash
+    escapes; all other control characters below 0x20, and DEL, are dropped.
+    """
+    # Keys are code points throughout; start with "delete every control char".
+    table = dict.fromkeys(range(0x20), "")
+    table[0x7F] = ""  # DEL
+    # The escapes are stored last so they replace the "delete" entries for
+    # newline (0x0a) and TAB (0x09).
+    table[ord("\\")] = "\\\\"
+    table[ord("\n")] = "\\n"
+    table[ord("\t")] = "\\t"
+
+    return str.maketrans(table)
+
+def make_escape_field_trans():
+    """Return the str.translate() table used for single-line TSV fields.
+
+    Newline and TAB each become one space; all other control characters
+    below 0x20, and DEL, are dropped.
+    """
+    # Delete every control character and DEL ...
+    table = dict.fromkeys([*range(0x20), 0x7F], "")
+    # ... except the two separators, which are flattened to a space.
+    table.update({ord("\n"): " ", ord("\t"): " "})
+
+    return str.maketrans(table)
+
+# Both tables are built once when the script starts and are shared by the
+# two escape helpers that follow.
+escape_content_tbl = make_escape_content_trans()
+escape_field_tbl = make_escape_field_trans()
+
+
+def escape_content(s):
+    """Return s escaped for the TSV content column.
+
+    Applies escape_content_tbl, then strips leading/trailing whitespace.
+    """
+    translated = s.translate(escape_content_tbl)
+    return translated.strip()
+
+
+def escape_field(s):
+    """Return s flattened for a single-line TSV field.
+
+    Applies escape_field_tbl, then strips leading/trailing whitespace.
+    """
+    translated = s.translate(escape_field_tbl)
+    return translated.strip()
+
+# The URL of the index page to crawl is the only (required) argument.
+if len(sys.argv) < 2:
+    # stdout carries the TSV records, so the usage message goes to stderr.
+    print("usage: <url>", file=sys.stderr)
+    sys.exit(1)
+
+url = sys.argv[1]
+
+# Firefox is started headless.
+options = Options()
+options.add_argument("--headless")
+
+# To use an existing profile instead of the custom one set up further down:
+# NOTE: that profile must not be in use by a running Firefox at the same time.
+#options.add_argument("--profile")
+#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
+#options.add_argument(profile_path)
+#options.set_preference("profile", profile_path)
+
+# setup custom profile:
+preferences = {
+    # JS disabled
+    "javascript.enabled": False,
+    # disable stylesheet
+    "permissions.default.stylesheet": 2,
+    # disable image loading
+    "permissions.default.image": 2,
+    # override user-agent.
+    #"general.useragent.override": "whatever you want",
+}
+for pref_name, pref_value in preferences.items():
+    options.set_preference(pref_name, pref_value)
+
+driver = webdriver.Firefox(options=options)
+
+# set timeouts
+#driver.implicitly_wait(10)
+
+# The crawl runs inside try/finally: an exception while loading or parsing a
+# page must not leave the headless Firefox process behind.
+try:
+    # get the index page
+    driver.get(url)
+
+    # debugging aids:
+    #print(driver.title)
+    #print(driver.page_source)
+    #pagesource = driver.execute_script("return document.body.innerHTML;")
+    #print(pagesource)
+    #outer_html = driver.find_element(By.TAG_NAME, "html").get_attribute("outerHTML")
+    #print(outer_html)
+
+    # Collect the targets of the anchors inside the main element as plain
+    # strings before navigating away from the index page.
+    # (for all links on the page: driver.find_elements(By.TAG_NAME, "a"))
+    hrefs = []
+    for anchor in driver.find_elements(By.CSS_SELECTOR, "main a"):
+        href = anchor.get_attribute("href")
+        # get_attribute() returns None for an anchor without href; calling
+        # len() on that would raise TypeError, so test truthiness instead.
+        if href:
+            hrefs.append(href)
+
+    for href in hrefs:
+        driver.get(href)
+
+        # parse timestamp.
+        # NOTE: strptime() yields a naive datetime, so timestamp() interprets
+        # the date as midnight in the local timezone.
+        time_elem = driver.find_element(By.TAG_NAME, "time")
+        published = datetime.strptime(time_elem.text, "%Y-%m-%d")
+        ts = int(published.timestamp())
+
+        content = driver.find_element(By.CSS_SELECTOR, "article").get_attribute("outerHTML")
+        title = driver.title.replace(" - Codemadness", "")
+
+        # escape fields
+        content = escape_content(content)
+        title = escape_field(title)
+        link = escape_field(href)
+
+        # one TSV record per article: timestamp, title, link, content, type
+        print("%d\t%s\t%s\t%s\thtml" % (ts, title, link, content))
+finally:
+    # quit() closes every window and ends the session, so a separate
+    # close() call is not needed.
+    driver.quit()