article.cgi - gophercgis - Collection of gopher CGI/DCGI for geomyidae
(HTM) hg clone https://bitbucket.org/iamleot/gophercgis
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
article.cgi
---
1 #!/bin/sh
2
3 . ../common/config.sh
4 . ../common/html.sh
5
#
# Extract article content
#
# Reads the HTML of an article page on standard input and writes to
# standard output only the fragments worth rendering:
#
#  - the article header block, with the "articleSection" span removed;
#  - the article body, from the line opening the "entry-content" div up
#    to and including the line opening the "entry-tools" div, with every
#    <img> element removed;
#  - the offline reading links (PDF, EPUB, MOBI) found on the
#    "readoffline-shortcode" line, wrapped in a <pre> element.
#
# Takes no arguments.  A link format that is not present on the
# "readoffline-shortcode" line is skipped instead of being printed with
# an empty URL, and no <pre> element is emitted when none is found.
#
extract_article()
{

	awk '
	#
	# Return "LABEL: <URL>" followed by a newline for the download link
	# of the given kind ("pdf", "epub", "mobi") on the current line, or
	# an empty string when the line carries no such link.
	#
	function offline_link(label, kind,    prefix) {
		prefix = "class=\"" kind "\" rel=\"nofollow\" href=\""
		if (!match($0, prefix "[^\"]+\""))
			return ""

		# The match spans the prefix, the URL and the closing quote:
		# skip the former and drop the latter.
		return sprintf("%s: <%s>\n", label, substr($0, RSTART + length(prefix), RLENGTH - length(prefix) - 1))
	}

	# NOTE(review): the blank before each tag in the two patterns below
	# is part of the match; confirm it agrees with the indentation used
	# by the live page markup.
	/ <div class="article-header"/,/ <\/div>/ {
		# Remove the article section
		gsub(/<span itemprop="articleSection" .*<\/a><\/span>/, "")

		print
	}

	/<div class="entry-content" itemprop="articleBody">/,/<div class="entry-tools">/ {
		# Remove all img-s
		gsub(/<img [^>]+\/?>/, "")

		print
	}

	/class="readoffline-shortcode"/ {
		links = offline_link("PDF", "pdf") \
		    offline_link("EPUB", "epub") \
		    offline_link("MOBI", "mobi")

		# Only emit the block when at least one link was found.
		if (links != "")
			printf("<pre>%s</pre>", links)
	}
	'
}
39
40
# The article URL is taken from the second positional argument.
# NOTE(review): the URL is handed to curl as-is, so any scheme curl
# supports is accepted; if this argument can come from an untrusted
# client consider restricting it (e.g. curl --proto "=http,https").
url="$2"

# Fetch the page (custom User-Agent, follow redirects, no URL globbing,
# silent), keep only the article fragments and convert them to text.
# html_to_text is not defined in this file; presumably it comes from
# ../common/html.sh.
curl -A Googlebot-News -Lgs -- "${url}" |
	extract_article |
	html_to_text

# Trailer: a blank line followed by the source URL.  printf is used
# instead of echo so that backslashes in the URL are printed verbatim
# regardless of which shell provides /bin/sh.
printf '\n'
printf 'URL: <%s>\n' "${url}"