add tscrape_update, tscrape_html format program, update Makefile - tscrape - twitter scraper
(HTM) git clone git://git.codemadness.org/tscrape
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
(DIR) commit 155b8a4fb6cbfe358721d3604bcd4526993f7897
(DIR) parent 7bdeb05e31e28c4cfaf385dffa48ea80aa476315
(HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
Date: Sat, 12 Aug 2017 17:47:20 +0200
add tscrape_update, tscrape_html format program, update Makefile
similar to sfeed
Diffstat:
M Makefile | 78 ++++++++++++++++++++++---------
M tscrape_html.c | 22 +++++++++++++++-------
M tscrape_plain.c | 3 ++-
A tscrape_update | 109 +++++++++++++++++++++++++++++++
4 files changed, 181 insertions(+), 31 deletions(-)
---
(DIR) diff --git a/Makefile b/Makefile
@@ -6,6 +6,8 @@ BIN = \
tscrape\
tscrape_html\
tscrape_plain
+SCRIPTS = \
+ tscrape_update
SRC = ${BIN:=.c}
@@ -23,19 +25,30 @@ LIBXMLOBJ = ${LIBXMLSRC:.c=.o}
LIB = ${LIBUTIL} ${LIBXML}
-MAN1 = ${BIN:=.1}
+MAN1 =
+# TODO
+#${BIN:=.1}\
+#${SCRIPTS:=.1}
+
+MAN5 =
+# TODO
+#\
+# tscrape.5\
+# tscraperc.5
DOC = \
LICENSE\
- README
+ README\
+ TODO
HDR = \
+ util.h\
xml.h
all: $(BIN)
${BIN}: ${LIB} ${@:=.o}
-OBJ = ${SRC:.c=.o} ${LIBUTILOBJ} ${LIBXMLOBJ}
+OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ}
${OBJ}: config.mk ${HDR}
@@ -43,7 +56,7 @@ ${OBJ}: config.mk ${HDR}
${CC} ${LDFLAGS} -o $@ $< ${LIB}
.c.o:
- ${CC} -c ${CFLAGS} ${CPPFLAGS} -o $@ -c $<
+ ${CC} ${CFLAGS} ${CPPFLAGS} -o $@ -c $<
${LIBUTIL}: ${LIBUTILOBJ}
${AR} rc $@ $?
@@ -53,35 +66,54 @@ ${LIBXML}: ${LIBXMLOBJ}
${AR} rc $@ $?
${RANLIB} $@
-dist: $(BIN)
- rm -rf release/${VERSION}
- mkdir -p release/${VERSION}
- cp -f ${MAN1} ${DOC} ${HDR} \
- ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} \
+dist:
+ rm -rf "${NAME}-${VERSION}"
+ mkdir -p "${NAME}-${VERSION}"
+ cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \
+ ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${SCRIPTS} \
Makefile config.mk \
- release/${VERSION}/
+ tscraperc.example style.css \
+ "${NAME}-${VERSION}"
# make tarball
- rm -f tscrape-${VERSION}.tar.gz
- (cd release/${VERSION}; \
- tar -czf ../../tscrape-${VERSION}.tar.gz .)
+ tar -cf - "${NAME}-${VERSION}" | \
+ gzip -c > "${NAME}-${VERSION}.tar.gz"
+ rm -rf "${NAME}-${VERSION}"
clean:
rm -f ${BIN} ${OBJ} ${LIB}
install: all
- # installing executable files.
- mkdir -p ${DESTDIR}${PREFIX}/bin
- cp -f ${BIN} ${SCRIPTS} ${DESTDIR}${PREFIX}/bin
- for f in $(BIN); do chmod 755 ${DESTDIR}${PREFIX}/bin/$$f; done
+ # installing executable files and scripts.
+ mkdir -p "${DESTDIR}${PREFIX}/bin"
+ cp -f ${BIN} ${SCRIPTS} "${DESTDIR}${PREFIX}/bin"
+ for f in $(BIN) $(SCRIPTS); do chmod 755 "${DESTDIR}${PREFIX}/bin/$$f"; done
+ # installing example files.
+ mkdir -p "${DESTDIR}${PREFIX}/share/${NAME}"
+ cp -f tscraperc.example\
+ style.css\
+ README\
+ "${DESTDIR}${PREFIX}/share/${NAME}"
# installing manual pages for tools.
- mkdir -p ${DESTDIR}${MANPREFIX}/man1
- cp -f ${MAN1} ${DESTDIR}${MANPREFIX}/man1
- for m in $(MAN1); do chmod 644 ${DESTDIR}${MANPREFIX}/man1/$$m; done
+# TODO
+# mkdir -p "${DESTDIR}${MANPREFIX}/man1"
+# cp -f ${MAN1} "${DESTDIR}${MANPREFIX}/man1"
+# for m in $(MAN1); do chmod 644 "${DESTDIR}${MANPREFIX}/man1/$$m"; done
+# # installing manual pages for tscraperc(5).
+# mkdir -p "${DESTDIR}${MANPREFIX}/man5"
+# cp -f ${MAN5} "${DESTDIR}${MANPREFIX}/man5"
+# for m in $(MAN5); do chmod 644 "${DESTDIR}${MANPREFIX}/man5/$$m"; done
uninstall:
- # removing executable files.
- for f in $(BIN); do rm -f ${DESTDIR}${PREFIX}/bin/$$f; done
+ # removing executable files and scripts.
+ for f in $(BIN) $(SCRIPTS); do rm -f "${DESTDIR}${PREFIX}/bin/$$f"; done
+ # removing example files.
+ rm -f \
+ "${DESTDIR}${PREFIX}/share/${NAME}/tscraperc.example"\
+ "${DESTDIR}${PREFIX}/share/${NAME}/style.css"\
+ "${DESTDIR}${PREFIX}/share/${NAME}/README"
+ -rmdir "${DESTDIR}${PREFIX}/share/${NAME}"
# removing manual pages.
- for m in $(MAN1); do rm -f ${DESTDIR}${MANPREFIX}/man1/$$m; done
+ for m in $(MAN1); do rm -f "${DESTDIR}${MANPREFIX}/man1/$$m"; done
+ for m in $(MAN5); do rm -f "${DESTDIR}${MANPREFIX}/man5/$$m"; done
.PHONY: all clean dist install uninstall
(DIR) diff --git a/tscrape_html.c b/tscrape_html.c
@@ -53,8 +53,23 @@ printfeed(FILE *fp, struct feed *f)
fprintf(stdout, "%04d-%02d-%02d %02d:%02d ",
tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
tm->tm_hour, tm->tm_min);
+
if (isnew)
fputs("<b><u>", stdout);
+
+ if (fields[FieldRetweetid][0]) {
+ fputs("<a href=\"https://mobile.twitter.com/", stdout);
+ xmlencode(fields[FieldItemUsername], stdout);
+ fputs("/status/", stdout);
+ xmlencode(fields[FieldRetweetid], stdout);
+ fputs("\">retweeted</a>", stdout);
+ fputs(" <a href=\"https://mobile.twitter.com/", stdout);
+ xmlencode(fields[FieldItemUsername], stdout);
+ fputs("\">@", stdout);
+ xmlencode(fields[FieldItemUsername], stdout);
+ fputs("</a> ", stdout);
+ }
+
if (islink) {
fputs("<a href=\"https://mobile.twitter.com/", stdout);
xmlencode(fields[FieldUsername], stdout);
@@ -68,13 +83,6 @@ printfeed(FILE *fp, struct feed *f)
if (isnew)
fputs("</u></b>", stdout);
- if (fields[FieldRetweetid][0]) {
- printf(" <a href=\"https://mobile.twitter.com/");
- xmlencode(fields[FieldItemUsername], stdout);
- fputs("/status/", stdout);
- xmlencode(fields[FieldRetweetid], stdout);
- fputs("\">[retweet]</a>", stdout);
- }
fputs("\n", stdout);
}
}
(DIR) diff --git a/tscrape_plain.c b/tscrape_plain.c
@@ -48,7 +48,8 @@ printfeed(FILE *fp, const char *feedname)
tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
tm->tm_hour, tm->tm_min);
- printutf8pad(stdout, fields[FieldFullname], 25, ' ');
+ printutf8pad(stdout, fields[FieldItemFullname], 25, ' ');
+ fputs(" ", stdout);
printescape(fields[FieldText]);
putchar('\n');
}
(DIR) diff --git a/tscrape_update b/tscrape_update
@@ -0,0 +1,109 @@
+#!/bin/sh
+# update feeds, merge with old feeds.
+# NOTE: assumes "tscrape_*" executables are in $PATH.
+
+# defaults
+tscrapepath="$HOME/.tscrape/feeds"
+
+# source the configuration file into the current shell.
+# the config is plain shell script: it is sourced here so that it can
+# override $tscrapepath or functions.
+# loadconfig(configfile)
+loadconfig() {
+	# default config location, unless a path is given as first argument.
+	config="$HOME/.tscrape/tscraperc"
+	if [ x"$1" != x"" ]; then
+		# resolve the given path to an absolute path.
+		config=$(readlink -f "$1")
+	fi
+
+	# bail out when the config file cannot be read.
+	if [ ! -r "${config}" ]; then
+		echo "Configuration file \"${config}\" does not exist or is not readable." >&2
+		echo "See tscraperc.example for an example." >&2
+		exit 1
+	fi
+
+	# evaluate the config in this shell.
+	. "${config}"
+}
+
+# merge two raw feed files into one de-duplicated, ordered stream on stdout.
+# merge(oldfile, newfile)
+merge() {
+ # first sort: -u keeps a single line per unique key made of field 5 and
+ # field 8 (id and retweetid according to the original author). error
+ # output of this sort is discarded.
+ # second sort: order by field 1, numeric and reversed (timestamp,
+ # descending, according to the original author).
+ # NOTE(review): the -t separator shows up as one whitespace character in
+ # this rendering; presumably a literal TAB in the real file, confirm.
+ (sort -t ' ' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
+ sort -t ' ' -k1rn,1
+}
+
+# download a feed with curl; the response body goes to stdout.
+# feedfile is handed to curl's -z (time condition) option.
+# a status line (OK or FAIL, current time, feed name) is logged to stderr.
+# fetchfeed(url, name, feedfile)
+fetchfeed() {
+	curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1" && {
+		printf " OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+		return
+	}
+	printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
+}
+
+# fetch one feed, parse it with tscrape and merge the result into the stored
+# feed file. the whole body runs in a subshell started in the background, so
+# the caller has to `wait` for it to finish.
+# NOTE(review): tmpencfile and encoding are assigned but never read in this
+# function; presumably leftovers from sfeed, confirm before removing since a
+# sourced config could reference them.
+# feed(name, feedurl, [basesiteurl], [encoding])
+feed() {
+ (name="$1"
+ tmpfeedfile="${tscrapetmpdir}/${name}"
+ tmpencfile=""
+ encoding="$4"
+ tscrapefile="${tscrapepath}/$1"
+
+ # download the feed and pipe it through tscrape (called with the third
+ # argument); the parsed output lands in the temporary file.
+ fetchfeed "$2" "$1" "${tscrapefile}" | tscrape "$3" > "${tmpfeedfile}"
+
+ # merged data is first written to a ".new" file next to the stored feed.
+ tscrapefilenew="${tscrapepath}/${name}.new"
+ # only update the stored feed when the parsed output is non-empty.
+ if [ -s "${tmpfeedfile}" ]; then
+ # stored feed exists already: merge old and new data.
+ if [ -e "${tscrapefile}" ]; then
+ merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
+ # replace the stored feed with the merged result.
+ mv "${tscrapefilenew}" "${tscrapefile}"
+ else
+ # no stored feed yet: run the new data through merge (against
+ # /dev/null) and write it directly to the feed file.
+ merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
+ fi
+ fi) &
+}
+
+# signal handler: flag the run as terminated.
+terminated() {
+	isrunning=0
+}
+
+# delete the temporary directory and everything stored in it.
+cleanup() {
+	rm -rf "$tscrapetmpdir"
+}
+
+# fallback feeds(): only reports an error on stderr.
+# the config file is sourced after this definition and is expected to
+# replace it with its own feeds function.
+feeds() {
+	{
+		echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function."
+		echo "See tscraperc.example for an example."
+	} >&2
+}
+
+# main: load the config, fetch all feeds concurrently, then clean up.
+
+# load config file (optional path in the first argument).
+loadconfig "$1"
+# create the temporary directory for fetched feeds. abort when that fails:
+# with an empty $tscrapetmpdir, feed() would write its temporary files
+# to "/<name>".
+tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')" || exit 1
+# set to "0" by the SIGTERM handler, checked at the end of the run.
+isrunning="1"
+# SIGTERM: signal to terminate parent.
+trap -- "terminated" "15"
+# SIGINT (^C): send SIGTERM to the whole current process group, which
+# kills all running children.
+trap -- "kill -TERM -$$" "2"
+# make sure path exists.
+mkdir -p "${tscrapepath}"
+# fetch feeds specified in config file.
+feeds
+# wait till all feeds are fetched (concurrently).
+wait
+# cleanup temporary files etc.
+cleanup
+# exit non-zero only if terminated. an "if" is used on purpose: as the last
+# command, `[ ... ] && exit 1` would make the script exit with status 1
+# whenever the test is false, i.e. on every normal run.
+if [ "${isrunning}" = "0" ]; then
+	exit 1
+fi