Check-in by ben on 2025-04-13 00:05:13 Add feature to block by IP address. Block bad bots that are ignoring gopher://tilde.pink/0/robots.txt INSERTED DELETED 1 0 config.m4 1 0 readme.txt 4 2 src/cache.awk 10 1 src/cgi.awk 1 0 src/config.awk 17 3 TOTAL over 5 changed files Index: config.m4 ================================================================== --- config.m4 +++ config.m4 @@ -4,10 +4,11 @@ dnl define(__AGENT__, Lynx/2.9.0dev.10 libwww-FM/2.14 SSL-MM/1.4.1 OpenSSL/1.1.1w)dnl define(__API_ENDPOINT__, http://archive.org)dnl define(__API_SSL_ENDPOINT__, https://archive.org)dnl define(__AWK_EXT__, 0)dnl +define(__BLOCKLIST__, /home/user/pharos/block.txt)dnl define(__CACHE_DB__, /home/user/pharos/db/cache.dat)dnl define(__CACHE_ENABLED__, 0)dnl define(__CGIPATH__, /~user/pharos)dnl define(__CMD_AWK__, /usr/bin/awk)dnl define(__CMD_CURL__, /usr/bin/curl)dnl Index: readme.txt ================================================================== --- readme.txt +++ readme.txt @@ -33,10 +33,11 @@ Configuration ============= To set configuration variables, edit config.m4 AWK_EXT allows use of non-standard features in gawk and mawk +BLOCKLIST is a file to block by IP address, one regex per line CACHE_ENABLED caches content in sqlite to reduce API calls Installation ============ Installation depends on m4. 
Index: src/cache.awk
==================================================================
--- src/cache.awk
+++ src/cache.awk
@@ -27,10 +27,11 @@
         return
     }
     cache_signature = signature
     sql = "CREATE TABLE IF NOT EXISTS cache ( \
         id TEXT PRIMARY KEY, \
+        ip TEXT, \
         epoch INTEGER, \
         result TEXT)"
     sqlite_exec(cache_db, sql)
     retval = cache_get_value()
     return retval
@@ -48,12 +49,13 @@
 
 function cache_set_value(value,    sql) {
     if (!cache_enabled) {
         return
     }
-    sql = sprintf("REPLACE INTO cache(id, epoch, result) \
-        VALUES('%s', unixepoch(), '%s')",
+    sql = sprintf("REPLACE INTO cache(id, ip, epoch, result) \
+        VALUES('%s', '%s', unixepoch(), '%s')",
         sqlite_escape(cache_signature),
+        sqlite_escape(ENVIRON["REMOTE_ADDR"]),
         sqlite_escape(value))
     sqlite_exec(cache_db, sql)
     return
 }
Index: src/cgi.awk
==================================================================
--- src/cgi.awk
+++ src/cgi.awk
@@ -1,6 +1,15 @@
-function cgi_init() {
+function cgi_init(    ip, pattern) {
+    ip = ENVIRON["REMOTE_ADDR"]
+    while ((getline pattern < blocklist) > 0) {
+        # bad bot ignoring robots.txt, block by IP address; skip blank lines (an empty regex matches every address)
+        if (pattern != "" && match(ip, pattern)) {
+            exit 0
+        }
+    }
+    close(blocklist)
+
     search = ARGV[1]
     arguments = ARGV[2]
     traversal = ARGV[5]
     selector = ARGV[6]
 
Index: src/config.awk
==================================================================
--- src/config.awk
+++ src/config.awk
@@ -1,10 +1,11 @@
 function config_init() {
     agent = "__AGENT__"
     api_endpoint = "__API_ENDPOINT__"
     api_ssl_endpoint = "__API_SSL_ENDPOINT__"
     awk_ext = __AWK_EXT__
+    blocklist = "__BLOCKLIST__"
     cache_db = "__CACHE_DB__"
     cache_enabled = __CACHE_ENABLED__
     cgipath = "__CGIPATH__"
     cmd_curl = "__CMD_CURL__"
     cmd_enc = "__CMD_ENV__"