add tscrape_update, tscrape_html format program, update Makefile - tscrape - twitter scraper
 (HTM) git clone git://git.codemadness.org/tscrape
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
 (DIR) commit 155b8a4fb6cbfe358721d3604bcd4526993f7897
 (DIR) parent 7bdeb05e31e28c4cfaf385dffa48ea80aa476315
 (HTM) Author: Hiltjo Posthuma <hiltjo@codemadness.org>
       Date:   Sat, 12 Aug 2017 17:47:20 +0200
       
       add tscrape_update, tscrape_html format program, update Makefile
       
       similar to sfeed
       
       Diffstat:
         M Makefile                            |      78 ++++++++++++++++++++++---------
         M tscrape_html.c                      |      22 +++++++++++++++-------
         M tscrape_plain.c                     |       3 ++-
         A tscrape_update                      |     109 +++++++++++++++++++++++++++++++
       
       4 files changed, 181 insertions(+), 31 deletions(-)
       ---
 (DIR) diff --git a/Makefile b/Makefile
       @@ -6,6 +6,8 @@ BIN = \
                tscrape\
                tscrape_html\
                tscrape_plain
       +SCRIPTS = \
       +        tscrape_update
        
        SRC = ${BIN:=.c}
        
       @@ -23,19 +25,30 @@ LIBXMLOBJ = ${LIBXMLSRC:.c=.o}
        
        LIB = ${LIBUTIL} ${LIBXML}
        
       -MAN1 = ${BIN:=.1}
       +MAN1 = 
        
       +# TODO
       +#${BIN:=.1}\
       +#${SCRIPTS:=.1}
       +
       +MAN5 = 
       +# TODO
       +#\
       +#        tscrape.5\
       +#        tscraperc.5
        DOC = \
                LICENSE\
       -        README
       +        README\
       +        TODO
        HDR = \
       +        util.h\
                xml.h
        
        all: $(BIN)
        
        ${BIN}: ${LIB} ${@:=.o}
        
       -OBJ = ${SRC:.c=.o} ${LIBUTILOBJ} ${LIBXMLOBJ}
       +OBJ = ${SRC:.c=.o} ${LIBXMLOBJ} ${LIBUTILOBJ}
        
        ${OBJ}: config.mk ${HDR}
        
       @@ -43,7 +56,7 @@ ${OBJ}: config.mk ${HDR}
                ${CC} ${LDFLAGS} -o $@ $< ${LIB}
        
        .c.o:
       -        ${CC} -c ${CFLAGS} ${CPPFLAGS} -o $@ -c $<
       +        ${CC} ${CFLAGS} ${CPPFLAGS} -o $@ -c $<
        
        ${LIBUTIL}: ${LIBUTILOBJ}
                ${AR} rc $@ $?
       @@ -53,35 +66,54 @@ ${LIBXML}: ${LIBXMLOBJ}
                ${AR} rc $@ $?
                ${RANLIB} $@
        
       -dist: $(BIN)
       -        rm -rf release/${VERSION}
       -        mkdir -p release/${VERSION}
       -        cp -f ${MAN1} ${DOC} ${HDR} \
       -                ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} \
       +dist:
       +        rm -rf "${NAME}-${VERSION}"
       +        mkdir -p "${NAME}-${VERSION}"
       +        cp -f ${MAN1} ${MAN5} ${DOC} ${HDR} \
       +                ${SRC} ${LIBXMLSRC} ${LIBUTILSRC} ${SCRIPTS} \
                        Makefile config.mk \
       -                release/${VERSION}/
       +                tscraperc.example style.css \
       +                "${NAME}-${VERSION}"
                # make tarball
       -        rm -f tscrape-${VERSION}.tar.gz
       -        (cd release/${VERSION}; \
       -        tar -czf ../../tscrape-${VERSION}.tar.gz .)
       +        tar -cf - "${NAME}-${VERSION}" | \
       +                gzip -c > "${NAME}-${VERSION}.tar.gz"
       +        rm -rf "${NAME}-${VERSION}"
        
        clean:
                rm -f ${BIN} ${OBJ} ${LIB}
        
        install: all
       -        # installing executable files.
       -        mkdir -p ${DESTDIR}${PREFIX}/bin
       -        cp -f ${BIN} ${SCRIPTS} ${DESTDIR}${PREFIX}/bin
       -        for f in $(BIN); do chmod 755 ${DESTDIR}${PREFIX}/bin/$$f; done
       +        # installing executable files and scripts.
       +        mkdir -p "${DESTDIR}${PREFIX}/bin"
       +        cp -f ${BIN} ${SCRIPTS} "${DESTDIR}${PREFIX}/bin"
       +        for f in $(BIN) $(SCRIPTS); do chmod 755 "${DESTDIR}${PREFIX}/bin/$$f"; done
       +        # installing example files.
       +        mkdir -p "${DESTDIR}${PREFIX}/share/${NAME}"
       +        cp -f tscraperc.example\
       +                style.css\
       +                README\
       +                "${DESTDIR}${PREFIX}/share/${NAME}"
                # installing manual pages for tools.
       -        mkdir -p ${DESTDIR}${MANPREFIX}/man1
       -        cp -f ${MAN1} ${DESTDIR}${MANPREFIX}/man1
       -        for m in $(MAN1); do chmod 644 ${DESTDIR}${MANPREFIX}/man1/$$m; done
       +# TODO
       +#        mkdir -p "${DESTDIR}${MANPREFIX}/man1"
       +#        cp -f ${MAN1} "${DESTDIR}${MANPREFIX}/man1"
       +#        for m in $(MAN1); do chmod 644 "${DESTDIR}${MANPREFIX}/man1/$$m"; done
       +#        # installing manual pages for tscraperc(5).
       +#        mkdir -p "${DESTDIR}${MANPREFIX}/man5"
       +#        cp -f ${MAN5} "${DESTDIR}${MANPREFIX}/man5"
       +#        for m in $(MAN5); do chmod 644 "${DESTDIR}${MANPREFIX}/man5/$$m"; done
        
        uninstall:
       -        # removing executable files.
       -        for f in $(BIN); do rm -f ${DESTDIR}${PREFIX}/bin/$$f; done
       +        # removing executable files and scripts.
       +        for f in $(BIN) $(SCRIPTS); do rm -f "${DESTDIR}${PREFIX}/bin/$$f"; done
       +        # removing example files.
       +        rm -f \
       +                "${DESTDIR}${PREFIX}/share/${NAME}/tscraperc.example"\
       +                "${DESTDIR}${PREFIX}/share/${NAME}/style.css"\
       +                "${DESTDIR}${PREFIX}/share/${NAME}/README"
       +        -rmdir "${DESTDIR}${PREFIX}/share/${NAME}"
                # removing manual pages.
       -        for m in $(MAN1); do rm -f ${DESTDIR}${MANPREFIX}/man1/$$m; done
       +        for m in $(MAN1); do rm -f "${DESTDIR}${MANPREFIX}/man1/$$m"; done
       +        for m in $(MAN5); do rm -f "${DESTDIR}${MANPREFIX}/man5/$$m"; done
        
        .PHONY: all clean dist install uninstall
 (DIR) diff --git a/tscrape_html.c b/tscrape_html.c
       @@ -53,8 +53,23 @@ printfeed(FILE *fp, struct feed *f)
                        fprintf(stdout, "%04d-%02d-%02d&nbsp;%02d:%02d ",
                                tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
                                tm->tm_hour, tm->tm_min);
       +
                        if (isnew)
                                fputs("<b><u>", stdout);
       +
       +                if (fields[FieldRetweetid][0]) {
       +                        fputs("<a href=\"https://mobile.twitter.com/", stdout);
       +                        xmlencode(fields[FieldItemUsername], stdout);
       +                        fputs("/status/", stdout);
       +                        xmlencode(fields[FieldRetweetid], stdout);
       +                        fputs("\">retweeted</a>", stdout);
       +                        fputs(" <a href=\"https://mobile.twitter.com/", stdout);
       +                        xmlencode(fields[FieldItemUsername], stdout);
       +                        fputs("\">@", stdout);
       +                        xmlencode(fields[FieldItemUsername], stdout);
       +                        fputs("</a> ", stdout);
       +                }
       +
                        if (islink) {
                                fputs("<a href=\"https://mobile.twitter.com/", stdout);
                                xmlencode(fields[FieldUsername], stdout);
       @@ -68,13 +83,6 @@ printfeed(FILE *fp, struct feed *f)
                        if (isnew)
                                fputs("</u></b>", stdout);
        
       -                if (fields[FieldRetweetid][0]) {
       -                        printf(" <a href=\"https://mobile.twitter.com/");
       -                        xmlencode(fields[FieldItemUsername], stdout);
       -                        fputs("/status/", stdout);
       -                        xmlencode(fields[FieldRetweetid], stdout);
       -                        fputs("\">[retweet]</a>", stdout);
       -                }
                        fputs("\n", stdout);
                }
        }
 (DIR) diff --git a/tscrape_plain.c b/tscrape_plain.c
       @@ -48,7 +48,8 @@ printfeed(FILE *fp, const char *feedname)
                                tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
                                tm->tm_hour, tm->tm_min);
        
       -                printutf8pad(stdout, fields[FieldFullname], 25, ' ');
       +                printutf8pad(stdout, fields[FieldItemFullname], 25, ' ');
       +                fputs("  ", stdout);
                        printescape(fields[FieldText]);
                        putchar('\n');
                }
 (DIR) diff --git a/tscrape_update b/tscrape_update
       @@ -0,0 +1,109 @@
       +#!/bin/sh
       +# fetch the configured feeds concurrently and merge them with the old feeds.
       +# NOTE: assumes "tscrape_*" executables are in $PATH.
       +
       +# default directory of the merged feed files; the config may override it.
       +tscrapepath="$HOME/.tscrape/feeds"
       +
       +# source the config file (a shell script); exits 1 when it is unreadable.
       +# loadconfig(configfile)
       +loadconfig() {
       +        # a config file may be given as the first argument.
       +        if [ -n "$1" ]; then
       +                # resolve it to an absolute path.
       +                config="$(readlink -f "$1")"
       +        else
       +                # otherwise fall back to the default location.
       +                config="${HOME}/.tscrape/tscraperc"
       +        fi
       +
       +        # bail out early when the config cannot be read.
       +        if [ ! -r "${config}" ]; then
       +                echo "Configuration file \"${config}\" does not exist or is not readable." >&2
       +                echo "See tscraperc.example for an example." >&2
       +                exit 1
       +        fi
       +        # the config is sourced here so that it can override $tscrapepath
       +        # or functions.
       +        . "${config}"
       +}
       +
       +# merge two raw feed files into one de-duplicated, sorted stream on stdout.
       +# merge(oldfile, newfile)
       +merge() {
       +        # unique sort on fields 5 and 8 (id, retweetid); sort errors are hidden.
       +        # then order numerically by field 1 (timestamp), descending.
       +        (sort -t '        ' -u -k5,5 -k8,8 "$1" "$2" 2>/dev/null) |
       +        sort -t '        ' -k1rn,1
       +}
       +
       +# fetch a feed with curl and write it to stdout; log OK/FAIL to stderr.
       +# fetchfeed(url, name, feedfile)
       +fetchfeed() {
       +        if ! curl --http1.0 -H 'User-Agent:' -f -s -S --max-time 15 -z "$3" "$1"; then
       +                printf "FAIL %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
       +        else
       +                printf "  OK %s %s\n" "$(date +'%H:%M:%S')" "$2" >&2
       +        fi
       +}
       +
       +# fetch, parse and merge one feed in a background subshell.
       +# feed(name, feedurl, [basesiteurl], [encoding])
       +feed() {
       +        (name="$1"
       +        tmpfeedfile="${tscrapetmpdir}/${name}"
       +        tmpencfile="" # not referenced elsewhere in this script.
       +        encoding="$4" # not referenced elsewhere in this script.
       +        tscrapefile="${tscrapepath}/$1"
       +
       +        fetchfeed "$2" "$1" "${tscrapefile}" | tscrape "$3" > "${tmpfeedfile}"
       +
       +        # path for the merged result before it replaces the old feed file.
       +        tscrapefilenew="${tscrapepath}/${name}.new"
       +        # only update when the parsed feed data is non-empty.
       +        if [ -s "${tmpfeedfile}" ]; then
       +                # a feed file exists already: merge it with the new data.
       +                if [ -e "${tscrapefile}" ]; then
       +                        merge "${tscrapefile}" "${tmpfeedfile}" > "${tscrapefilenew}"
       +                        # replace the old file with the merged file.
       +                        mv "${tscrapefilenew}" "${tscrapefile}"
       +                else
       +                        merge "/dev/null" "${tmpfeedfile}" > "${tscrapefile}"
       +                fi
       +        fi) &
       +}
       +
       +terminated() {
       +        isrunning="0" # checked at the end of the script to exit with status 1.
       +}
       +
       +cleanup() {
       +        # remove the temporary directory and the fetched feed files in it.
       +        rm -rf "${tscrapetmpdir}"
       +}
       +
       +feeds() { # fallback: reports an error if the config did not define feeds().
       +        echo "Configuration file \"${config}\" is invalid or does not contain a \"feeds\" function." >&2
       +        echo "See tscraperc.example for an example." >&2
       +}
       +
       +# load config file.
       +loadconfig "$1"
       +# create a temporary directory for the fetched feeds; abort if that fails.
       +tscrapetmpdir="$(mktemp -d '/tmp/tscrape_XXXXXX')" || exit 1
       +# set to "0" by terminated() when SIGTERM is received.
       +isrunning="1"
       +# SIGTERM: signal to terminate parent.
       +trap -- "terminated" "15"
       +# SIGINT: send SIGTERM to the whole process group, including all children.
       +trap -- "kill -TERM -$$" "2"
       +# make sure path exists.
       +mkdir -p "${tscrapepath}"
       +# fetch feeds specified in config file.
       +feeds
       +# wait till all feeds are fetched (concurrently).
       +wait
       +# cleanup temporary files etc.
       +cleanup
       +# exit 1 if terminated; "[ ... ] && exit 1" as last command would also yield 1 on success.
       +if [ "${isrunning}" = "0" ]; then exit 1; fi