selenium_crawl_tsv.py - brcon2023-hackathons - Bitreichcon 2023 Hackathon Repository
(HTM) git clone git://bitreich.org/brcon2023-hackathons git://enlrupgkhuxnvlhsf6lc3fziv5h2hhfrinws65d7roiv6bfj7d652fid.onion/brcon2023-hackathons
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) Tags
---
selenium_crawl_tsv.py (3006B)
---
1 from selenium import webdriver
2 from selenium.webdriver.common.by import By
3
4 from selenium.webdriver.firefox.options import Options
5 from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
6
7 import sys
8 from datetime import datetime
9
def make_escape_content_trans():
    r"""Build the ``str.translate`` table for the TSV content field.

    Backslash, newline and tab are replaced by the two-character escape
    sequences ``\\``, ``\n`` and ``\t``.  Every other ASCII control
    character (0-31) and DEL (0x7f) is removed.
    """
    # Start by deleting all control characters and DEL ...
    table = dict.fromkeys([*range(32), 0x7F], "")
    # ... then override the ones that get a printable escape sequence.
    table.update({
        ord("\\"): "\\\\",
        ord("\n"): "\\n",
        ord("\t"): "\\t",
    })
    return str.maketrans(table)
21
def make_escape_field_trans():
    """Build the ``str.translate`` table for single-line TSV fields.

    Newline and tab are replaced by a single space.  Every other ASCII
    control character (0-31) and DEL (0x7f) is removed.
    """
    # Delete all control characters and DEL by default.
    table = {code: "" for code in range(32)}
    table[0x7F] = ""
    # Field separators inside a value are flattened to a space instead.
    for ch in "\n\t":
        table[ord(ch)] = " "
    return str.maketrans(table)
32
# Translation tables are built once at load time and reused for every
# escaped value.
escape_field_tbl = make_escape_field_trans()
escape_content_tbl = make_escape_content_trans()
35
def escape_content(s):
    """Return *s* escaped for the TSV content field.

    Applies ``escape_content_tbl`` and then strips leading and trailing
    whitespace from the result.
    """
    escaped = s.translate(escape_content_tbl)
    return escaped.strip()
38
def escape_field(s):
    """Return *s* escaped for a single-line TSV field.

    Applies ``escape_field_tbl`` and then strips leading and trailing
    whitespace from the result.
    """
    flattened = s.translate(escape_field_tbl)
    return flattened.strip()
41
# The page to crawl is the first command-line argument; bail out without it.
if len(sys.argv) <= 1:
    print("usage: <url>")
    sys.exit(1)

url = sys.argv[1]
47
# Firefox options: run headless.
options = Options()
options.add_argument("--headless")

# use existing profile:
# NOTE: Firefox must not be running with that profile at the same time.
#options.add_argument("--profile")
#profile_path = "/home/hiltjo/.mozilla/firefox/z86g7oxr.default-release"
#options.add_argument(profile_path)
#options.set_preference("profile", profile_path)

# setup custom profile:
for pref, value in (
    ("javascript.enabled", False),          # JS disabled
    ("permissions.default.stylesheet", 2),  # disable stylesheet
    ("permissions.default.image", 2),       # disable image loading
):
    options.set_preference(pref, value)

# override user-agent.
#options.set_preference("general.useragent.override", "whatever you want")
68
# Crawl the index page given as `url`, visit every link found inside its
# <main> element and print one TSV line per visited page:
#   timestamp <TAB> title <TAB> link <TAB> escaped article HTML <TAB> "html"
driver = webdriver.Firefox(options=options)

# set timeouts
#driver.implicitly_wait(10)

try:
    # get the index page
    driver.get(url)

    # Debugging aids:
    #print(driver.title)
    #print(driver.page_source)

    # Collect all link targets before navigating anywhere: element handles
    # of the index page become stale once another page is loaded.
    hrefs = []
    for anchor in driver.find_elements(By.CSS_SELECTOR, "main a"):
        href = anchor.get_attribute("href")
        # get_attribute() returns None for an <a> without a href attribute,
        # so test truthiness instead of calling len() on it.
        if href:
            hrefs.append(href)

    for href in hrefs:
        driver.get(href)

        # parse timestamp.
        # NOTE: strptime() yields a naive datetime, which timestamp()
        # interprets in the local timezone of the machine running the crawl.
        time_el = driver.find_element(By.TAG_NAME, "time")
        ts = int(datetime.strptime(time_el.text, "%Y-%m-%d").timestamp())

        article = driver.find_element(By.CSS_SELECTOR, "article")
        content = article.get_attribute("outerHTML")
        title = driver.title.replace(" - Codemadness", "")

        # escape fields and emit the TSV record
        print("%d\t%s\t%s\t%s\thtml" % (
            ts,
            escape_field(title),
            escape_field(href),
            escape_content(content),
        ))
finally:
    # Always end the browser session, also when a page fails to load or
    # parse; quit() closes every window, so a separate close() is not needed.
    driver.quit()