sed.c - sbase - suckless unix tools
 (HTM) git clone git://git.suckless.org/sbase
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       sed.c (41896B)
       ---
            1 /* FIXME: summary
            2  * decide whether we enforce valid UTF-8, right now it's enforced in certain
            3  *     parts of the script, but not the input...
            4  * nul bytes cause explosions due to use of libc string functions. thoughts?
            5  * lack of newline at end of file, currently we add one. what should we do?
            6  * allow "\\t" for "\t" etc. in regex? in replacement text?
            7  * POSIX says don't flush on N when out of input, but GNU and busybox do.
            8  */
            9 
           10 #include <ctype.h>
           11 #include <errno.h>
           12 #include <regex.h>
           13 #include <stdlib.h>
           14 #include <string.h>
           15 
           16 #include "utf.h"
           17 #include "util.h"
           18 
           19 /* Types */
           20 
           21 /* used as queue for writes and stack for {,:,b,t */
           22 typedef struct {
           23         void **data;
           24         size_t size;
           25         size_t cap;
           26 } Vec;
           27 
           28 /* used for arbitrary growth, str is a C string
           29  * FIXME: does it make sense to keep track of length? or just rely on libc
           30  *        string functions? If we want to support nul bytes everything changes
           31  */
           32 typedef struct {
           33         char  *str;
           34         size_t cap;
           35 } String;
           36 
           37 typedef struct Cmd Cmd;
           38 typedef struct {
           39         void  (*fn)(Cmd *);
           40         char *(*getarg)(Cmd *, char *);
           41         void  (*freearg)(Cmd *);
           42         unsigned char naddr;
           43 } Fninfo;
           44 
           45 typedef struct {
           46         union {
           47                 size_t   lineno;
           48                 regex_t *re;
           49         } u;
           50         enum {
           51                 IGNORE, /* empty address, ignore        */
           52                 EVERY , /* every line                   */
           53                 LINE  , /* line number                  */
           54                 LAST  , /* last line ($)                */
           55                 REGEX , /* use included regex           */
           56                 LASTRE, /* use most recently used regex */
           57         } type;
           58 } Addr;
           59 
           60 /* DISCUSS: naddr is not strictly necessary, but very helpful
           61  * naddr == 0 iff beg.type == EVERY  && end.type == IGNORE
           62  * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE
           63  * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE
           64  */
           65 typedef struct {
           66         Addr          beg;
           67         Addr          end;
           68         unsigned char naddr;
           69 } Range;
           70 
           71 typedef struct {
           72         regex_t      *re; /* if NULL use last regex */
           73         String        repl;
           74         FILE         *file;
           75         size_t        occurrence; /* 0 for all (g flag) */
           76         Rune          delim;
           77         unsigned int  p:1;
           78 } Sarg;
           79 
           80 typedef struct {
           81         Rune *set1;
           82         Rune *set2;
           83 } Yarg;
           84 
           85 typedef struct {
           86         String str; /* a,c,i text. r file path */
           87         void  (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */
           88 } ACIRarg;
           89 
           90 struct Cmd {
           91         Range   range;
           92         Fninfo *fninfo;
           93         union {
           94                 Cmd      *jump;   /* used for   b,t when running  */
           95                 char     *label;  /* used for :,b,t when building */
           96                 ptrdiff_t offset; /* used for { (pointers break during realloc) */
           97                 FILE     *file;   /* used for w */
           98 
           99                 /* FIXME: Should the following be in the union? or pointers and malloc? */
          100                 Sarg      s;
          101                 Yarg      y;
          102                 ACIRarg   acir;
          103         } u; /* I find your lack of anonymous unions disturbing */
          104         unsigned int in_match:1;
          105         unsigned int negate  :1;
          106 };
          107 
          108 /* Files for w command (and s' w flag) */
          109 typedef struct {
          110         char *path;
          111         FILE *file;
          112 } Wfile;
          113 
          114 /*
          115  * Function Declarations
          116  */
          117 
          118 /* Dynamically allocated arrays and strings */
          119 static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next);
          120 static void *pop(Vec *v);
          121 static void push(Vec *v, void *p);
          122 static void stracat(String *dst, char *src);
          123 static void strnacat(String *dst, char *src, size_t n);
          124 static void stracpy(String *dst, char *src);
          125 
          126 /* Cleanup and errors */
          127 static void usage(void);
          128 
          129 /* Parsing functions and related utilities */
          130 static void compile(char *s, int isfile);
          131 static int read_line(FILE *f, String *s);
          132 static char *make_range(Range *range, char *s);
          133 static char *make_addr(Addr *addr, char *s);
          134 static char *find_delim(char *s, Rune delim, int do_brackets);
          135 static char *chompr(char *s, Rune rune);
          136 static char *chomp(char *s);
          137 static Rune *strtorunes(char *s, size_t nrunes);
          138 static long stol(char *s, char **endp);
          139 static size_t escapes(char *beg, char *end, Rune delim, int n_newline);
          140 static size_t echarntorune(Rune *r, char *s, size_t n);
          141 static void insert_labels(void);
          142 
          143 /* Get and Free arg and related utilities */
          144 static char *get_aci_arg(Cmd *c, char *s);
          145 static void aci_append(Cmd *c, char *s);
          146 static void free_acir_arg(Cmd *c);
          147 static char *get_bt_arg(Cmd *c, char *s);
          148 static char *get_r_arg(Cmd *c, char *s);
          149 static char *get_s_arg(Cmd *c, char *s);
          150 static void free_s_arg(Cmd *c);
          151 static char *get_w_arg(Cmd *c, char *s);
          152 static char *get_y_arg(Cmd *c, char *s);
          153 static void free_y_arg(Cmd *c);
          154 static char *get_colon_arg(Cmd *c, char *s);
          155 static char *get_lbrace_arg(Cmd *c, char *s);
          156 static char *get_rbrace_arg(Cmd *c, char *s);
          157 static char *semicolon_arg(char *s);
          158 
          159 /* Running */
          160 static void run(void);
          161 static int in_range(Cmd *c);
          162 static int match_addr(Addr *a);
          163 static int next_file(void);
          164 static int is_eof(FILE *f);
          165 static void do_writes(void);
          166 static void write_file(char *path, FILE *out);
          167 static void check_puts(char *s, FILE *f);
          168 static void update_ranges(Cmd *beg, Cmd *end);
          169 
          170 /* Sed functions */
          171 static void cmd_y(Cmd *c);
          172 static void cmd_x(Cmd *c);
          173 static void cmd_w(Cmd *c);
          174 static void cmd_t(Cmd *c);
          175 static void cmd_s(Cmd *c);
          176 static void cmd_r(Cmd *c);
          177 static void cmd_q(Cmd *c);
          178 static void cmd_P(Cmd *c);
          179 static void cmd_p(Cmd *c);
          180 static void cmd_N(Cmd *c);
          181 static void cmd_n(Cmd *c);
          182 static void cmd_l(Cmd *c);
          183 static void cmd_i(Cmd *c);
          184 static void cmd_H(Cmd *c);
          185 static void cmd_h(Cmd *c);
          186 static void cmd_G(Cmd *c);
          187 static void cmd_g(Cmd *c);
          188 static void cmd_D(Cmd *c);
          189 static void cmd_d(Cmd *c);
          190 static void cmd_c(Cmd *c);
          191 static void cmd_b(Cmd *c);
          192 static void cmd_a(Cmd *c);
          193 static void cmd_colon(Cmd *c);
          194 static void cmd_equal(Cmd *c);
          195 static void cmd_lbrace(Cmd *c);
          196 static void cmd_rbrace(Cmd *c);
          197 static void cmd_last(Cmd *c);
          198 
          199 /* Actions */
          200 static void new_line(void);
          201 static void app_line(void);
          202 static void new_next(void);
          203 static void old_next(void);
          204 
          205 /*
          206  * Globals
          207  */
          208 static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */
          209 static Vec writes; /* holds cmd*. writes scheduled by a and r commands */
          210 static Vec wfiles; /* holds Wfile*. files for w and s///w commands */
          211 
          212 static Cmd   *prog, *pc; /* Program, program counter */
          213 static size_t pcap;
          214 static size_t lineno;
          215 
          216 static regex_t *lastre; /* last used regex for empty regex search */
          217 static char   **files;  /* list of file names from argv */
          218 static FILE    *file;   /* current file we are reading */
          219 static int      ret;    /* exit status */
          220 
          221 static String patt, hold, genbuf;
          222 
          223 static struct {
          224         unsigned int n       :1; /* -n (no print) */
          225         unsigned int E       :1; /* -E (extended re) */
          226         unsigned int s       :1; /* s/// replacement happened */
          227         unsigned int aci_cont:1; /* a,c,i text continuation */
          228         unsigned int s_cont  :1; /* s/// replacement text continuation */
          229         unsigned int halt    :1; /* halt execution */
          230 } gflags;
          231 
          232 /* FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */
          233 static Fninfo fns[] = {
          234         ['a'] = { cmd_a     , get_aci_arg   , free_acir_arg , 1 }, /* schedule write of text for later                                                      */
          235         ['b'] = { cmd_b     , get_bt_arg    , NULL          , 2 }, /* branch to label char *label when building, Cmd *jump when running                     */
          236         ['c'] = { cmd_c     , get_aci_arg   , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text                     */
          237         ['d'] = { cmd_d     , NULL          , NULL          , 2 }, /* delete pattern space                                                                  */
          238         ['D'] = { cmd_D     , NULL          , NULL          , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d)        */
          239         ['g'] = { cmd_g     , NULL          , NULL          , 2 }, /* replace pattern space with hold space                                                 */
          240         ['G'] = { cmd_G     , NULL          , NULL          , 2 }, /* append newline and hold space to pattern space                                        */
          241         ['h'] = { cmd_h     , NULL          , NULL          , 2 }, /* replace hold space with pattern space                                                 */
          242         ['H'] = { cmd_H     , NULL          , NULL          , 2 }, /* append newline and pattern space to hold space                                        */
          243         ['i'] = { cmd_i     , get_aci_arg   , free_acir_arg , 1 }, /* write text                                                                            */
          244         ['l'] = { cmd_l     , NULL          , NULL          , 2 }, /* write pattern space in 'visually unambiguous form'                                    */
          245         ['n'] = { cmd_n     , NULL          , NULL          , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit)     */
          246         ['N'] = { cmd_N     , NULL          , NULL          , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */
          247         ['p'] = { cmd_p     , NULL          , NULL          , 2 }, /* write pattern space                                                                   */
          248         ['P'] = { cmd_P     , NULL          , NULL          , 2 }, /* write pattern space up to first newline                                               */
          249         ['q'] = { cmd_q     , NULL          , NULL          , 1 }, /* quit                                                                                  */
          250         ['r'] = { cmd_r     , get_r_arg     , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file)                    */
          251         ['s'] = { cmd_s     , get_s_arg     , free_s_arg    , 2 }, /* find/replace/all that crazy s stuff                                                   */
          252         ['t'] = { cmd_t     , get_bt_arg    , NULL          , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */
          253         ['w'] = { cmd_w     , get_w_arg     , NULL          , 2 }, /* append pattern space to file                                                          */
          254         ['x'] = { cmd_x     , NULL          , NULL          , 2 }, /* exchange pattern and hold spaces                                                      */
          255         ['y'] = { cmd_y     , get_y_arg     , free_y_arg    , 2 }, /* replace runes in set1 with runes in set2                                              */
          256         [':'] = { cmd_colon , get_colon_arg , NULL          , 0 }, /* defines label for later b and t commands                                              */
          257         ['='] = { cmd_equal , NULL          , NULL          , 1 }, /* printf("%d\n", line_number);                                                          */
          258         ['{'] = { cmd_lbrace, get_lbrace_arg, NULL          , 2 }, /* if we match, run commands, otherwise jump to close                                    */
          259         ['}'] = { cmd_rbrace, get_rbrace_arg, NULL          , 0 }, /* noop, hold onto open for ease of building scripts                                     */
          260 
          261         [0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */
          262 };
          263 
          264 /*
          265  * Function Definitions
          266  */
          267 
          268 /* given memory pointed to by *ptr that currently holds *nmemb members of size
          269  * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one
          270  * past old end in *next. if realloc fails...explode
          271  */
          272 static void
          273 resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next)
          274 {
          275         void *n, *tmp;
          276 
          277         if (new_nmemb) {
          278                 tmp = ereallocarray(*ptr, new_nmemb, size);
          279         } else { /* turns out realloc(*ptr, 0) != free(*ptr) */
          280                 free(*ptr);
          281                 tmp = NULL;
          282         }
          283         n = (char *)tmp + *nmemb * size;
          284         *nmemb = new_nmemb;
          285         *ptr   = tmp;
          286         if (next)
          287                 *next = n;
          288 }
          289 
          290 static void *
          291 pop(Vec *v)
          292 {
          293         if (!v->size)
          294                 return NULL;
          295         return v->data[--v->size];
          296 }
          297 
          298 static void
          299 push(Vec *v, void *p)
          300 {
          301         if (v->size == v->cap)
          302                 resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL);
          303         v->data[v->size++] = p;
          304 }
          305 
          306 static void
          307 stracat(String *dst, char *src)
          308 {
          309         int new = !dst->cap;
          310         size_t len;
          311 
          312         len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1;
          313         if (dst->cap < len)
          314                 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
          315         if (new)
          316                 *dst->str = '\0';
          317         strcat(dst->str, src);
          318 }
          319 
          320 static void
          321 strnacat(String *dst, char *src, size_t n)
          322 {
          323         int new = !dst->cap;
          324         size_t len;
          325 
          326         len = strlen(src);
          327         len = (new ? 0 : strlen(dst->str)) + MIN(n, len) + 1;
          328         if (dst->cap < len)
          329                 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
          330         if (new)
          331                 *dst->str = '\0';
          332         strlcat(dst->str, src, len);
          333 }
          334 
          335 static void
          336 stracpy(String *dst, char *src)
          337 {
          338         size_t len;
          339 
          340         len = strlen(src) + 1;
          341         if (dst->cap < len)
          342                 resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
          343         strcpy(dst->str, src);
          344 }
          345 
          346 static void
          347 leprintf(char *s)
          348 {
          349         if (errno)
          350                 eprintf("%zu: %s: %s\n", lineno, s, strerror(errno));
          351         else
          352                 eprintf("%zu: %s\n", lineno, s);
          353 }
          354 
          355 /* FIXME: write usage message */
          356 static void
          357 usage(void)
          358 {
          359         eprintf("usage: sed [-nrE] script [file ...]\n"
          360                 "       sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n"
          361                 "       sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n");
          362 }
          363 
          364 /* Differences from POSIX
          365  * we allows semicolons and trailing blanks inside {}
          366  * we allow spaces after ! (and in between !s)
          367  * we allow extended regular expressions (-E)
          368  */
          369 static void
          370 compile(char *s, int isfile)
          371 {
          372         FILE *f;
          373 
          374         if (isfile) {
          375                 f = fopen(s, "r");
          376                 if (!f)
          377                         eprintf("fopen %s:", s);
          378         } else {
          379                 if (!*s) /* empty string script */
          380                         return;
          381                 f = fmemopen(s, strlen(s), "r");
          382                 if (!f)
          383                         eprintf("fmemopen:");
          384         }
          385 
          386         /* NOTE: get arg functions can't use genbuf */
          387         while (read_line(f, &genbuf) != EOF) {
          388                 s = genbuf.str;
          389 
          390                 /* if the first two characters of the script are "#n" default output shall be suppressed */
          391                 if (++lineno == 1 && *s == '#' && s[1] == 'n') {
          392                         gflags.n = 1;
          393                         continue;
          394                 }
          395 
          396                 if (gflags.aci_cont) {
          397                         aci_append(pc - 1, s);
          398                         continue;
          399                 }
          400                 if (gflags.s_cont)
          401                         s = (pc - 1)->fninfo->getarg(pc - 1, s);
          402 
          403                 while (*s) {
          404                         s = chompr(s, ';');
          405                         if (!*s || *s == '#')
          406                                 break;
          407 
          408                         if ((size_t)(pc - prog) == pcap)
          409                                 resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc);
          410 
          411                         pc->range.beg.type = pc->range.end.type = IGNORE;
          412                         pc->fninfo = NULL;
          413                         pc->in_match = 0;
          414 
          415                         s = make_range(&pc->range, s);
          416                         s = chomp(s);
          417                         pc->negate = *s == '!';
          418                         s = chompr(s, '!');
          419 
          420                         if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn)
          421                                 leprintf("bad sed function");
          422                         if (pc->range.naddr > pc->fninfo->naddr)
          423                                 leprintf("wrong number of addresses");
          424                         s++;
          425 
          426                         if (pc->fninfo->getarg)
          427                                 s = pc->fninfo->getarg(pc, s);
          428 
          429                         pc++;
          430                 }
          431         }
          432 
          433         fshut(f, s);
          434 }
          435 
          436 /* FIXME: if we decide to honor lack of trailing newline, set/clear a global
          437  * flag when reading a line
          438  */
          439 static int
          440 read_line(FILE *f, String *s)
          441 {
          442         ssize_t len;
          443 
          444         if (!f)
          445                 return EOF;
          446 
          447         if ((len = getline(&s->str, &s->cap, f)) < 0) {
          448                 if (ferror(f))
          449                         eprintf("getline:");
          450                 return EOF;
          451         }
          452         if (s->str[--len] == '\n')
          453                 s->str[len] = '\0';
          454         return 0;
          455 }
          456 
          457 /* read first range from s, return pointer to one past end of range */
          458 static char *
          459 make_range(Range *range, char *s)
          460 {
          461         s = make_addr(&range->beg, s);
          462 
          463         if (*s == ',')
          464                 s = make_addr(&range->end, s + 1);
          465         else
          466                 range->end.type = IGNORE;
          467 
          468         if      (range->beg.type == EVERY  && range->end.type == IGNORE) range->naddr = 0;
          469         else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1;
          470         else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2;
          471         else leprintf("this is impossible...");
          472 
          473         return s;
          474 }
          475 
          476 /* read first addr from s, return pointer to one past end of addr */
          477 static char *
          478 make_addr(Addr *addr, char *s)
          479 {
          480         Rune r;
          481         char *p = s + strlen(s);
          482         size_t rlen = echarntorune(&r, s, p - s);
          483 
          484         if (r == '$') {
          485                 addr->type = LAST;
          486                 s += rlen;
          487         } else if (isdigitrune(r)) {
          488                 addr->type = LINE;
          489                 addr->u.lineno = stol(s, &s);
          490         } else if (r == '/' || r == '\\') {
          491                 Rune delim;
          492                 if (r == '\\') {
          493                         s += rlen;
          494                         rlen = echarntorune(&r, s, p - s);
          495                 }
          496                 if (r == '\\')
          497                         leprintf("bad delimiter '\\'");
          498                 delim = r;
          499                 s += rlen;
          500                 rlen = echarntorune(&r, s, p - s);
          501                 if (r == delim) {
          502                         addr->type = LASTRE;
          503                         s += rlen;
          504                 } else {
          505                         addr->type = REGEX;
          506                         p = find_delim(s, delim, 1);
          507                         if (!*p)
          508                                 leprintf("unclosed regex");
          509                         p -= escapes(s, p, delim, 0);
          510                         *p++ = '\0';
          511                         addr->u.re = emalloc(sizeof(*addr->u.re));
          512                         eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0);
          513                         s = p;
          514                 }
          515         } else {
          516                 addr->type = EVERY;
          517         }
          518 
          519         return s;
          520 }
          521 
          522 /* return pointer to first delim in s that is not escaped
          523  * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside [])
          524  * return pointer to trailing nul byte if no delim found
          525  *
          526  * any escaped character that is not special is just itself (POSIX undefined)
          527  * FIXME: pull out into some util thing, will be useful for ed as well
          528  */
          529 static char *
          530 find_delim(char *s, Rune delim, int do_brackets)
          531 {
          532         enum {
          533                 OUTSIDE         , /* not in brackets */
          534                 BRACKETS_OPENING, /* last char was first [ or last two were first [^ */
          535                 BRACKETS_INSIDE , /* inside [] */
          536                 INSIDE_OPENING  , /* inside [] and last char was [ */
          537                 CLASS_INSIDE    , /* inside class [::], or colating element [..] or [==], inside [] */
          538                 CLASS_CLOSING   , /* inside class [::], or colating element [..] or [==], and last character was the respective : . or = */
          539         } state = OUTSIDE;
          540 
          541         Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */
          542         size_t rlen;
          543         int escape = 0;
          544         char *end = s + strlen(s);
          545 
          546         for (; *s; s += rlen) {
          547                 rlen = echarntorune(&r, s, end - s);
          548 
          549                 if      (state == BRACKETS_OPENING       &&  r == '^'  ) {                            continue; }
          550                 else if (state == BRACKETS_OPENING       &&  r == ']'  ) { state  = BRACKETS_INSIDE ; continue; }
          551                 else if (state == BRACKETS_OPENING                     ) { state  = BRACKETS_INSIDE ;           }
          552 
          553                 if      (state == CLASS_CLOSING          &&  r == ']'  ) { state  = BRACKETS_INSIDE ;           }
          554                 else if (state == CLASS_CLOSING                        ) { state  = CLASS_INSIDE    ;           }
          555                 else if (state == CLASS_INSIDE           &&  r ==  c   ) { state  = CLASS_CLOSING   ;           }
          556                 else if (state == INSIDE_OPENING         && (r == ':'  ||
          557                                                              r == '.'  ||
          558                                                              r == '=') ) { state  = CLASS_INSIDE    ; c = r;    }
          559                 else if (state == INSIDE_OPENING         &&  r == ']'  ) { state  = OUTSIDE         ;           }
          560                 else if (state == INSIDE_OPENING                       ) { state  = BRACKETS_INSIDE ;           }
          561                 else if (state == BRACKETS_INSIDE        &&  r == '['  ) { state  = INSIDE_OPENING  ;           }
          562                 else if (state == BRACKETS_INSIDE        &&  r == ']'  ) { state  = OUTSIDE         ;           }
          563                 else if (state == OUTSIDE                &&  escape    ) { escape = 0               ;           }
          564                 else if (state == OUTSIDE                &&  r == '\\' ) { escape = 1               ;           }
          565                 else if (state == OUTSIDE                &&  r == delim) return s;
          566                 else if (state == OUTSIDE && do_brackets &&  r == '['  ) { state  = BRACKETS_OPENING;           }
          567         }
          568         return s;
          569 }
          570 
          571 static char *
          572 chomp(char *s)
          573 {
          574         return chompr(s, 0);
          575 }
          576 
          577 /* eat all leading whitespace and occurrences of rune */
          578 static char *
          579 chompr(char *s, Rune rune)
          580 {
          581         Rune   r;
          582         size_t rlen;
          583         char  *end = s + strlen(s);
          584 
          585         while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune))
          586                 s += rlen;
          587         return s;
          588 }
          589 
          590 /* convert first nrunes Runes from UTF-8 string s in allocated Rune*
          591  * NOTE: sequence must be valid UTF-8, check first */
          592 static Rune *
          593 strtorunes(char *s, size_t nrunes)
          594 {
          595         Rune *rs, *rp;
          596 
          597         rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs));
          598 
          599         while (nrunes--)
          600                 s += chartorune(rp++, s);
          601 
          602         *rp = '\0';
          603         return rs;
          604 }
          605 
          606 static long
          607 stol(char *s, char **endp)
          608 {
          609         long n;
          610         errno = 0;
          611         n = strtol(s, endp, 10);
          612 
          613         if (errno)
          614                 leprintf("strtol:");
          615         if (*endp == s)
          616                 leprintf("strtol: invalid number");
          617 
          618         return n;
          619 }
          620 
          621 /* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim)
          622  * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal)
          623  * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command)
          624  * if delim is 0 all escaped characters represent themselves (aci text)
          625  * memmove rest of string (beyond end) into place
          626  * return the number of converted escapes (backslashes removed)
          627  * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better
          628  */
          629 static size_t
          630 escapes(char *beg, char *end, Rune delim, int n_newline)
          631 {
          632         size_t num = 0;
          633         char *src = beg, *dst = beg;
          634 
          635         while (src < end) {
          636                 /* handle escaped backslash specially so we don't think the second
          637                  * backslash is escaping something */
          638                 if (*src == '\\' && src[1] == '\\') {
          639                         *dst++ = *src++;
          640                         if (delim)
          641                                 *dst++ = *src++;
          642                         else
          643                                 src++;
          644                 } else if (*src == '\\' && !delim) {
          645                         src++;
          646                 } else if (*src == '\\' && src[1]) {
          647                         Rune r;
          648                         size_t rlen;
          649                         num++;
          650                         src++;
          651                         rlen = echarntorune(&r, src, end - src);
          652 
          653                         if (r == 'n' && delim == 'n') {
          654                                 *src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */
          655                         } else if (r == 'n') {
          656                                 *src = '\n';
          657                         } else if (r != delim) {
          658                                 *dst++ = '\\';
          659                                 num--;
          660                         }
          661 
          662                         memmove(dst, src, rlen);
          663                         dst += rlen;
          664                         src += rlen;
          665                 } else {
          666                         *dst++ = *src++;
          667                 }
          668         }
          669         memmove(dst, src, strlen(src) + 1);
          670         return num;
          671 }
          672 
          673 static size_t
          674 echarntorune(Rune *r, char *s, size_t n)
          675 {
          676         size_t rlen = charntorune(r, s, n);
          677         if (!rlen || *r == Runeerror)
          678                 leprintf("invalid UTF-8");
          679         return rlen;
          680 }
          681 
          682 static void
          683 insert_labels(void)
          684 {
          685         size_t i;
          686         Cmd *from, *to;
          687 
          688         while (branches.size) {
          689                 from = prog + (ptrdiff_t)pop(&branches);
          690 
          691                 if (!from->u.label) {/* no label branch to end of script */
          692                         from->u.jump = pc - 1;
          693                 } else {
          694                         for (i = 0; i < labels.size; i++) {
          695                                 to = prog + (ptrdiff_t)labels.data[i];
          696                                 if (!strcmp(from->u.label, to->u.label)) {
          697                                         from->u.jump = to;
          698                                         break;
          699                                 }
          700                         }
          701                         if (i == labels.size)
          702                                 leprintf("bad label");
          703                 }
          704         }
          705 }
          706 
          707 /*
          708  * Getargs / Freeargs
          709  * Read argument from s, return pointer to one past last character of argument
          710  */
          711 
          712 /* POSIX compliant
          713  * i\
          714  * foobar
          715  *
          716  * also allow the following non POSIX compliant
          717  * i        # empty line
          718  * ifoobar
          719  * ifoobar\
          720  * baz
          721  *
          722  * FIXME: GNU and busybox discard leading spaces
          723  * i  foobar
          724  * i foobar
          725  * ifoobar
          726  * are equivalent in GNU and busybox. We don't. Should we?
          727  */
          728 static char *
          729 get_aci_arg(Cmd *c, char *s)
          730 {
          731         c->u.acir.print = check_puts;
          732         c->u.acir.str = (String){ NULL, 0 };
          733 
          734         gflags.aci_cont = !!*s; /* no continue flag if empty string */
          735 
          736         /* neither empty string nor POSIX compliant */
          737         if (*s && !(*s == '\\' && !s[1]))
          738                 aci_append(c, s);
          739 
          740         return s + strlen(s);
          741 }
          742 
          743 static void
          744 aci_append(Cmd *c, char *s)
          745 {
          746         char *end = s + strlen(s), *p = end;
          747 
          748         gflags.aci_cont = 0;
          749         while (--p >= s && *p == '\\')
          750                 gflags.aci_cont = !gflags.aci_cont;
          751 
          752         if (gflags.aci_cont)
          753                 *--end = '\n';
          754 
          755         escapes(s, end, 0, 0);
          756         stracat(&c->u.acir.str, s);
          757 }
          758 
          759 static void
          760 free_acir_arg(Cmd *c)
          761 {
          762         free(c->u.acir.str.str);
          763 }
          764 
          765 /* POSIX dictates that label is rest of line, including semicolons, trailing
          766  * whitespace, closing braces, etc. and can be limited to 8 bytes
          767  *
          768  * I allow a semicolon or closing brace to terminate a label name, it's not
          769  * POSIX compliant, but it's useful and every sed version I've tried to date
          770  * does the same.
          771  *
          772  * FIXME: POSIX dictates that leading whitespace is ignored but trailing
          773  * whitespace is not. This is annoying and we should probably get rid of it.
          774  */
          775 static char *
          776 get_bt_arg(Cmd *c, char *s)
          777 {
          778         char *p = semicolon_arg(s = chomp(s));
          779 
          780         if (p != s) {
          781                 c->u.label = estrndup(s, p - s);
          782         } else {
          783                 c->u.label = NULL;
          784         }
          785 
          786         push(&branches, (void *)(c - prog));
          787 
          788         return p;
          789 }
          790 
          791 /* POSIX dictates file name is rest of line including semicolons, trailing
          792  * whitespace, closing braces, etc. and file name must be preceded by a space
          793  *
          794  * I allow a semicolon or closing brace to terminate a file name and don't
          795  * enforce leading space.
          796  *
          797  * FIXME: decide whether trailing whitespace should be included and fix
          798  * accordingly
          799  */
          800 static char *
          801 get_r_arg(Cmd *c, char *s)
          802 {
          803         char *p = semicolon_arg(s = chomp(s));
          804 
          805         if (p == s)
          806                 leprintf("no file name");
          807 
          808         c->u.acir.str.str = estrndup(s, p - s);
          809         c->u.acir.print = write_file;
          810 
          811         return p;
          812 }
          813 
          814 /* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX)
          815  *
          816  * FIXME: allow other escapes in regex and replacement? if so change escapes()
          817  */
          818 static char *
          819 get_s_arg(Cmd *c, char *s)
          820 {
          821         Rune delim, r;
          822         Cmd buf;
          823         char *p;
          824         int esc, lastre;
          825 
          826         /* s/Find/Replace/Flags */
          827 
          828         /* Find */
          829         if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */
          830                 lastre = 0;
          831                 c->u.s.repl = (String){ NULL, 0 };
          832                 c->u.s.occurrence = 1;
          833                 c->u.s.file = NULL;
          834                 c->u.s.p = 0;
          835 
          836                 if (!*s || *s == '\\')
          837                         leprintf("bad delimiter");
          838 
          839                 p = s + strlen(s);
          840                 s += echarntorune(&delim, s, p - s);
          841                 c->u.s.delim = delim;
          842 
          843                 echarntorune(&r, s, p - s);
          844                 if (r == delim) /* empty regex */
          845                         lastre = 1;
          846 
          847                 p = find_delim(s, delim, 1);
          848                 if (!*p)
          849                         leprintf("missing second delimiter");
          850                 p -= escapes(s, p, delim, 0);
          851                 *p = '\0';
          852 
          853                 if (lastre) {
          854                         c->u.s.re = NULL;
          855                 } else {
          856                         c->u.s.re = emalloc(sizeof(*c->u.s.re));
          857                         /* FIXME: different eregcomp that calls fatal */
          858                         eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0);
          859                 }
          860                 s = p + runelen(delim);
          861         }
          862 
          863         /* Replace */
          864         delim = c->u.s.delim;
          865 
          866         p = find_delim(s, delim, 0);
          867         p -= escapes(s, p, delim, 0);
          868         if (!*p) { /* no third delimiter */
          869                 /* FIXME: same backslash counting as aci_append() */
          870                 if (p[-1] != '\\')
          871                         leprintf("missing third delimiter or <backslash><newline>");
          872                 p[-1] = '\n';
          873                 gflags.s_cont = 1;
          874         } else {
          875                 gflags.s_cont = 0;
          876         }
          877 
          878         /* check for bad references in replacement text */
          879         *p = '\0';
          880         for (esc = 0, p = s; *p; p++) {
          881                 if (esc) {
          882                         esc = 0;
          883                         if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub)
          884                                 leprintf("back reference number greater than number of groups");
          885                 } else if (*p == '\\') {
          886                         esc = 1;
          887                 }
          888         }
          889         stracat(&c->u.s.repl, s);
          890 
          891         if (gflags.s_cont)
          892                 return p;
          893 
          894         s = p + runelen(delim);
          895 
          896         /* Flags */
          897         p = semicolon_arg(s = chomp(s));
          898 
          899         /* FIXME: currently for simplicity take last of g or occurrence flags and
          900          *        ignore multiple p flags. need to fix that */
          901         for (; s < p; s++) {
          902                 if (isdigit(*s)) {
          903                         c->u.s.occurrence = stol(s, &s);
          904                         s--; /* for loop will advance pointer */
          905                 } else {
          906                         switch (*s) {
          907                         case 'g': c->u.s.occurrence = 0; break;
          908                         case 'p': c->u.s.p = 1;          break;
          909                         case 'w':
          910                                 /* must be last flag, take everything up to newline/semicolon
          911                                  * s == p after this */
          912                                 s = get_w_arg(&buf, chomp(s+1));
          913                                 c->u.s.file = buf.u.file;
          914                                 break;
          915                         }
          916                 }
          917         }
          918         return p;
          919 }
          920 
          921 static void
          922 free_s_arg(Cmd *c)
          923 {
          924         if (c->u.s.re)
          925                 regfree(c->u.s.re);
          926         free(c->u.s.re);
          927         free(c->u.s.repl.str);
          928 }
          929 
          930 /* see get_r_arg notes */
          931 static char *
          932 get_w_arg(Cmd *c, char *s)
          933 {
          934         char *p = semicolon_arg(s = chomp(s));
          935         Wfile *w, **wp;
          936 
          937         if (p == s)
          938                 leprintf("no file name");
          939 
          940         for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) {
          941                 if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) {
          942                         c->u.file = (*wp)->file;
          943                         return p;
          944                 }
          945         }
          946 
          947         w = emalloc(sizeof(*w));
          948         w->path = estrndup(s, p - s);
          949 
          950         if (!(w->file = fopen(w->path, "w")))
          951                 leprintf("fopen failed");
          952 
          953         c->u.file = w->file;
          954 
          955         push(&wfiles, w);
          956         return p;
          957 }
          958 
          959 static char *
          960 get_y_arg(Cmd *c, char *s)
          961 {
          962         Rune delim;
          963         char *p = s + strlen(s);
          964         size_t rlen = echarntorune(&delim, s, p - s);
          965         size_t nrunes1, nrunes2;
          966 
          967         c->u.y.set1 = c->u.y.set2 = NULL;
          968 
          969         s += rlen;
          970         p = find_delim(s, delim, 0);
          971         p -= escapes(s, p, delim, 1);
          972         nrunes1 = utfnlen(s, p - s);
          973         c->u.y.set1 = strtorunes(s, nrunes1);
          974 
          975         s = p + rlen;
          976         p = find_delim(s, delim, 0);
          977         p -= escapes(s, p, delim, 1);
          978         nrunes2 = utfnlen(s, p - s);
          979 
          980         if (nrunes1 != nrunes2)
          981                 leprintf("different set lengths");
          982 
          983         c->u.y.set2 = strtorunes(s, utfnlen(s, p - s));
          984 
          985         return p + rlen;
          986 }
          987 
          988 static void
          989 free_y_arg(Cmd *c)
          990 {
          991         free(c->u.y.set1);
          992         free(c->u.y.set2);
          993 }
          994 
          995 /* see get_bt_arg notes */
          996 static char *
          997 get_colon_arg(Cmd *c, char *s)
          998 {
          999         char *p = semicolon_arg(s = chomp(s));
         1000 
         1001         if (p == s)
         1002                 leprintf("no label name");
         1003 
         1004         c->u.label = estrndup(s, p - s);
         1005         push(&labels, (void *)(c - prog));
         1006         return p;
         1007 }
         1008 
         1009 static char *
         1010 get_lbrace_arg(Cmd *c, char *s)
         1011 {
         1012         push(&braces, (void *)(c - prog));
         1013         return s;
         1014 }
         1015 
         1016 static char *
         1017 get_rbrace_arg(Cmd *c, char *s)
         1018 {
         1019         Cmd *lbrace;
         1020 
         1021         if (!braces.size)
         1022                 leprintf("extra }");
         1023 
         1024         lbrace = prog + (ptrdiff_t)pop(&braces);
         1025         lbrace->u.offset = c - prog;
         1026         return s;
         1027 }
         1028 
         1029 /* s points to beginning of an argument that may be semicolon terminated
         1030  * return pointer to semicolon or nul byte after string
         1031  * or closing brace as to not force ; before }
         1032  * FIXME: decide whether or not to eat trailing whitespace for arguments that
         1033  *        we allow semicolon/brace termination that POSIX doesn't
         1034  *        b, r, t, w, :
         1035  *        POSIX says trailing whitespace is part of label name, file name, etc.
         1036  *        we should probably eat it
         1037  */
         1038 static char *
         1039 semicolon_arg(char *s)
         1040 {
         1041         char *p = strpbrk(s, ";}");
         1042         if (!p)
         1043                 p = s + strlen(s);
         1044         return p;
         1045 }
         1046 
         1047 static void
         1048 run(void)
         1049 {
         1050         lineno = 0;
         1051         if (braces.size)
         1052                 leprintf("extra {");
         1053 
         1054         /* genbuf has already been initialized, patt will be in new_line
         1055          * (or we'll halt) */
         1056         stracpy(&hold, "");
         1057 
         1058         insert_labels();
         1059         next_file();
         1060         new_line();
         1061 
         1062         for (pc = prog; !gflags.halt; pc++)
         1063                 pc->fninfo->fn(pc);
         1064 }
         1065 
         1066 /* return true if we are in range for c, set c->in_match appropriately */
         1067 static int
         1068 in_range(Cmd *c)
         1069 {
         1070         if (match_addr(&c->range.beg)) {
         1071                 if (c->range.naddr == 2) {
         1072                         if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno)
         1073                                 c->in_match = 0;
         1074                         else
         1075                                 c->in_match = 1;
         1076                 }
         1077                 return !c->negate;
         1078         }
         1079         if (c->in_match && match_addr(&c->range.end)) {
         1080                 c->in_match = 0;
         1081                 return !c->negate;
         1082         }
         1083         return c->in_match ^ c->negate;
         1084 }
         1085 
         1086 /* return true if addr matches current line */
         1087 static int
         1088 match_addr(Addr *a)
         1089 {
         1090         switch (a->type) {
         1091         default:
         1092         case IGNORE: return 0;
         1093         case EVERY: return 1;
         1094         case LINE: return lineno == a->u.lineno;
         1095         case LAST:
         1096                 while (is_eof(file) && !next_file())
         1097                         ;
         1098                 return !file;
         1099         case REGEX:
         1100                 lastre = a->u.re;
         1101                 return !regexec(a->u.re, patt.str, 0, NULL, 0);
         1102         case LASTRE:
         1103                 if (!lastre)
         1104                         leprintf("no previous regex");
         1105                 return !regexec(lastre, patt.str, 0, NULL, 0);
         1106         }
         1107 }
         1108 
         1109 /* move to next input file
         1110  * stdin if first call and no files
         1111  * return 0 for success and 1 for no more files
         1112  */
         1113 static int
         1114 next_file(void)
         1115 {
         1116         static unsigned char first = 1;
         1117 
         1118         if (file == stdin)
         1119                 clearerr(file);
         1120         else if (file)
         1121                 fshut(file, "<file>");
         1122         /* given no files, default to stdin */
         1123         file = first && !*files ? stdin : NULL;
         1124         first = 0;
         1125 
         1126         while (!file && *files) {
         1127                 if (!strcmp(*files, "-")) {
         1128                         file = stdin;
         1129                 } else if (!(file = fopen(*files, "r"))) {
         1130                         /* warn this file didn't open, but move on to next */
         1131                         weprintf("fopen %s:", *files);
         1132                         ret = 1;
         1133                 }
         1134                 files++;
         1135         }
         1136 
         1137         return !file;
         1138 }
         1139 
         1140 /* test if stream is at EOF */
         1141 static int
         1142 is_eof(FILE *f)
         1143 {
         1144         int c;
         1145 
         1146         if (!f || feof(f))
         1147                 return 1;
         1148 
         1149         c = fgetc(f);
         1150         if (c == EOF && ferror(f))
         1151                 eprintf("fgetc:");
         1152         if (c != EOF && ungetc(c, f) == EOF)
         1153                 eprintf("ungetc EOF\n");
         1154 
         1155         return c == EOF;
         1156 }
         1157 
         1158 /* perform writes that were scheduled
         1159  * for aci this is check_puts(string, stdout)
         1160  * for r this is write_file(path, stdout)
         1161  */
         1162 static void
         1163 do_writes(void)
         1164 {
         1165         Cmd *c;
         1166         size_t i;
         1167 
         1168         for (i = 0; i < writes.size; i++) {
         1169                 c = writes.data[i];
         1170                 c->u.acir.print(c->u.acir.str.str, stdout);
         1171         }
         1172         writes.size = 0;
         1173 }
         1174 
         1175 /* used for r's u.acir.print()
         1176  * FIXME: something like util's concat() would be better
         1177  */
         1178 static void
         1179 write_file(char *path, FILE *out)
         1180 {
         1181         FILE *in = fopen(path, "r");
         1182         if (!in) /* no file is treated as empty file */
         1183                 return;
         1184 
         1185         while (read_line(in, &genbuf) != EOF)
         1186                 check_puts(genbuf.str, out);
         1187 
         1188         fshut(in, path);
         1189 }
         1190 
         1191 static void
         1192 check_puts(char *s, FILE *f)
         1193 {
         1194         if (s && fputs(s, f) == EOF)
         1195                 eprintf("fputs:");
         1196         if (fputs("\n", f) == EOF)
         1197                 eprintf("fputs:");
         1198 }
         1199 
         1200 /* iterate from beg to end updating ranges so we don't miss any commands
         1201  * e.g. sed -n '1d;1,3p' should still print lines 2 and 3
         1202  */
         1203 static void
         1204 update_ranges(Cmd *beg, Cmd *end)
         1205 {
         1206         while (beg < end)
         1207                 in_range(beg++);
         1208 }
         1209 
         1210 /*
         1211  * Sed functions
         1212  */
         1213 static void
         1214 cmd_a(Cmd *c)
         1215 {
         1216         if (in_range(c))
         1217                 push(&writes, c);
         1218 }
         1219 
         1220 static void
         1221 cmd_b(Cmd *c)
         1222 {
         1223         if (!in_range(c))
         1224                 return;
         1225 
         1226         /* if we jump backwards update to end, otherwise update to destination */
         1227         update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap);
         1228         pc = c->u.jump;
         1229 }
         1230 
         1231 static void
         1232 cmd_c(Cmd *c)
         1233 {
         1234         if (!in_range(c))
         1235                 return;
         1236 
         1237         /* write the text on the last line of the match */
         1238         if (!c->in_match)
         1239                 check_puts(c->u.acir.str.str, stdout);
         1240         /* otherwise start the next cycle without printing pattern space
         1241          * effectively deleting the text */
         1242         new_next();
         1243 }
         1244 
         1245 static void
         1246 cmd_d(Cmd *c)
         1247 {
         1248         if (!in_range(c))
         1249                 return;
         1250 
         1251         new_next();
         1252 }
         1253 
         1254 static void
         1255 cmd_D(Cmd *c)
         1256 {
         1257         char *p;
         1258 
         1259         if (!in_range(c))
         1260                 return;
         1261 
         1262         if ((p = strchr(patt.str, '\n'))) {
         1263                 p++;
         1264                 memmove(patt.str, p, strlen(p) + 1);
         1265                 old_next();
         1266         } else {
         1267                 new_next();
         1268         }
         1269 }
         1270 
         1271 static void
         1272 cmd_g(Cmd *c)
         1273 {
         1274         if (in_range(c))
         1275                 stracpy(&patt, hold.str);
         1276 }
         1277 
         1278 static void
         1279 cmd_G(Cmd *c)
         1280 {
         1281         if (!in_range(c))
         1282                 return;
         1283 
         1284         stracat(&patt, "\n");
         1285         stracat(&patt, hold.str);
         1286 }
         1287 
         1288 static void
         1289 cmd_h(Cmd *c)
         1290 {
         1291         if (in_range(c))
         1292                 stracpy(&hold, patt.str);
         1293 }
         1294 
         1295 static void
         1296 cmd_H(Cmd *c)
         1297 {
         1298         if (!in_range(c))
         1299                 return;
         1300 
         1301         stracat(&hold, "\n");
         1302         stracat(&hold, patt.str);
         1303 }
         1304 
         1305 static void
         1306 cmd_i(Cmd *c)
         1307 {
         1308         if (in_range(c))
         1309                 check_puts(c->u.acir.str.str, stdout);
         1310 }
         1311 
         1312 /* I think it makes sense to print invalid UTF-8 sequences in octal to satisfy
         1313  * the "visually unambiguous form" sed(1p)
         1314  */
         1315 static void
         1316 cmd_l(Cmd *c)
         1317 {
         1318         Rune   r;
         1319         char  *p, *end;
         1320         size_t rlen;
         1321 
         1322         char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */
         1323                 ['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b",
         1324                 ['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t",
         1325                 ['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */
         1326         };
         1327 
         1328         if (!in_range(c))
         1329                 return;
         1330 
         1331         /* FIXME: line wrapping. sed(1p) says "length at which folding occurs is
         1332          * unspecified, but should be appropraite for the output device"
         1333          * just wrap at 80 Runes?
         1334          */
         1335         for (p = patt.str, end = p + strlen(p); p < end; p += rlen) {
         1336                 if (isascii(*p) && escapes[(unsigned int)*p]) {
         1337                         fputs(escapes[(unsigned int)*p], stdout);
         1338                         rlen = 1;
         1339                 } else if (!(rlen = charntorune(&r, p, end - p))) {
         1340                         /* ran out of chars, print the bytes of the short sequence */
         1341                         for (; p < end; p++)
         1342                                 printf("\\%03hho", (unsigned char)*p);
         1343                         break;
         1344                 } else if (r == Runeerror) {
         1345                         for (; rlen; rlen--, p++)
         1346                                 printf("\\%03hho", (unsigned char)*p);
         1347                 } else {
         1348                         while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR)
         1349                                 ;
         1350                         if (ferror(stdout))
         1351                                 eprintf("fwrite:");
         1352                 }
         1353         }
         1354         check_puts("$", stdout);
         1355 }
         1356 
         1357 static void
         1358 cmd_n(Cmd *c)
         1359 {
         1360         if (!in_range(c))
         1361                 return;
         1362 
         1363         if (!gflags.n)
         1364                 check_puts(patt.str, stdout);
         1365         do_writes();
         1366         new_line();
         1367 }
         1368 
         1369 static void
         1370 cmd_N(Cmd *c)
         1371 {
         1372         if (!in_range(c))
         1373                 return;
         1374         do_writes();
         1375         app_line();
         1376 }
         1377 
         1378 static void
         1379 cmd_p(Cmd *c)
         1380 {
         1381         if (in_range(c))
         1382                 check_puts(patt.str, stdout);
         1383 }
         1384 
         1385 static void
         1386 cmd_P(Cmd *c)
         1387 {
         1388         char *p;
         1389 
         1390         if (!in_range(c))
         1391                 return;
         1392 
         1393         if ((p = strchr(patt.str, '\n')))
         1394                 *p = '\0';
         1395 
         1396         check_puts(patt.str, stdout);
         1397 
         1398         if (p)
         1399                 *p = '\n';
         1400 }
         1401 
         1402 static void
         1403 cmd_q(Cmd *c)
         1404 {
         1405         if (!in_range(c))
         1406                 return;
         1407 
         1408         if (!gflags.n)
         1409                 check_puts(patt.str, stdout);
         1410         do_writes();
         1411         gflags.halt = 1;
         1412 }
         1413 
         1414 static void
         1415 cmd_r(Cmd *c)
         1416 {
         1417         if (in_range(c))
         1418                 push(&writes, c);
         1419 }
         1420 
         1421 static void
         1422 cmd_s(Cmd *c)
         1423 {
         1424         String tmp;
         1425         Rune r;
         1426         size_t plen, rlen, len;
         1427         char *p, *s, *end;
         1428         unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0;
         1429         regex_t *re;
         1430         regmatch_t *rm, *pmatch = NULL;
         1431 
         1432         if (!in_range(c))
         1433                 return;
         1434 
         1435         if (!c->u.s.re && !lastre)
         1436                 leprintf("no previous regex");
         1437 
         1438         re = c->u.s.re ? c->u.s.re : lastre;
         1439         lastre = re;
         1440 
         1441         plen = re->re_nsub + 1;
         1442         pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t));
         1443 
         1444         *genbuf.str = '\0';
         1445         s = patt.str;
         1446 
         1447         while (!qflag && !regexec(re, s, plen, pmatch, cflags)) {
         1448                 cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */
         1449                 if (!*s) /* match against empty string first time, but not again */
         1450                         qflag = 1;
         1451 
         1452                 /* don't substitute if last match was not empty but this one is.
         1453                  * s_a*_._g
         1454                  * foobar -> .f.o.o.b.r.
         1455                  */
         1456                 if ((last_empty || pmatch[0].rm_eo) &&
         1457                     (++matches == c->u.s.occurrence || !c->u.s.occurrence)) {
         1458                         /* copy over everything before the match */
         1459                         strnacat(&genbuf, s, pmatch[0].rm_so);
         1460 
         1461                         /* copy over replacement text, taking into account &, backreferences, and \ escapes */
         1462                         for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) {
         1463                                 strnacat(&genbuf, p, len);
         1464                                 p += len;
         1465                                 switch (*p) {
         1466                                 default: leprintf("this shouldn't be possible");
         1467                                 case '\0':
         1468                                         /* we're at the end, back up one so the ++p will put us on
         1469                                          * the null byte to break out of the loop */
         1470                                         --p;
         1471                                         break;
         1472                                 case '&':
         1473                                         strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so);
         1474                                         break;
         1475                                 case '\\':
         1476                                         if (isdigit(*++p)) { /* backreference */
         1477                                                 /* only need to check here if using lastre, otherwise we checked when building */
         1478                                                 if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub)
         1479                                                         leprintf("back reference number greater than number of groups");
         1480                                                 rm = &pmatch[*p - '0'];
         1481                                                 strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so);
         1482                                         } else { /* character after backslash taken literally (well one byte, but it works) */
         1483                                                 strnacat(&genbuf, p, 1);
         1484                                         }
         1485                                         break;
         1486                                 }
         1487                         }
         1488                 } else {
         1489                         /* not replacing, copy over everything up to and including the match */
         1490                         strnacat(&genbuf, s, pmatch[0].rm_eo);
         1491                 }
         1492 
         1493                 if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */
         1494                         end = s + strlen(s);
         1495                         rlen = charntorune(&r, s, end - s);
         1496 
         1497                         if (!rlen) { /* ran out of bytes, copy short sequence */
         1498                                 stracat(&genbuf, s);
         1499                                 s = end;
         1500                         } else { /* copy whether or not it's a good rune */
         1501                                 strnacat(&genbuf, s, rlen);
         1502                                 s += rlen;
         1503                         }
         1504                 }
         1505                 last_empty = !pmatch[0].rm_eo;
         1506                 s += pmatch[0].rm_eo;
         1507         }
         1508         free(pmatch);
         1509 
         1510         if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */
         1511                 return;
         1512 
         1513         gflags.s = 1;
         1514 
         1515         stracat(&genbuf, s);
         1516 
         1517         tmp    = patt;
         1518         patt   = genbuf;
         1519         genbuf = tmp;
         1520 
         1521         if (c->u.s.p)
         1522                 check_puts(patt.str, stdout);
         1523         if (c->u.s.file)
         1524                 check_puts(patt.str, c->u.s.file);
         1525 }
         1526 
         1527 static void
         1528 cmd_t(Cmd *c)
         1529 {
         1530         if (!in_range(c) || !gflags.s)
         1531                 return;
         1532 
         1533         /* if we jump backwards update to end, otherwise update to destination */
         1534         update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap);
         1535         pc = c->u.jump;
         1536         gflags.s = 0;
         1537 }
         1538 
         1539 static void
         1540 cmd_w(Cmd *c)
         1541 {
         1542         if (in_range(c))
         1543                 check_puts(patt.str, c->u.file);
         1544 }
         1545 
         1546 static void
         1547 cmd_x(Cmd *c)
         1548 {
         1549         String tmp;
         1550 
         1551         if (!in_range(c))
         1552                 return;
         1553 
         1554         tmp  = patt;
         1555         patt = hold;
         1556         hold = tmp;
         1557 }
         1558 
         1559 static void
         1560 cmd_y(Cmd *c)
         1561 {
         1562         String tmp;
         1563         Rune r, *rp;
         1564         size_t n, rlen;
         1565         char *s, *end, buf[UTFmax];
         1566 
         1567         if (!in_range(c))
         1568                 return;
         1569 
         1570         *genbuf.str = '\0';
         1571         for (s = patt.str, end = s + strlen(s); *s; s += rlen) {
         1572                 if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */
         1573                         stracat(&genbuf, s);
         1574                         break;
         1575                 } else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */
         1576                         strnacat(&genbuf, s, rlen);
         1577                 } else {
         1578                         for (rp = c->u.y.set1; *rp; rp++)
         1579                                 if (*rp == r)
         1580                                         break;
         1581                         if (*rp) { /* found r in set1, replace with Rune from set2 */
         1582                                 n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1));
         1583                                 strnacat(&genbuf, buf, n);
         1584                         } else {
         1585                                 strnacat(&genbuf, s, rlen);
         1586                         }
         1587                 }
         1588         }
         1589         tmp    = patt;
         1590         patt   = genbuf;
         1591         genbuf = tmp;
         1592 }
         1593 
         1594 static void
         1595 cmd_colon(Cmd *c)
         1596 {
         1597 }
         1598 
         1599 static void
         1600 cmd_equal(Cmd *c)
         1601 {
         1602         if (in_range(c))
         1603                 printf("%zu\n", lineno);
         1604 }
         1605 
         1606 static void
         1607 cmd_lbrace(Cmd *c)
         1608 {
         1609         Cmd *jump;
         1610 
         1611         if (in_range(c))
         1612                 return;
         1613 
         1614         /* update ranges on all commands we skip */
         1615         jump = prog + c->u.offset;
         1616         update_ranges(c + 1, jump);
         1617         pc = jump;
         1618 }
         1619 
         1620 static void
         1621 cmd_rbrace(Cmd *c)
         1622 {
         1623 }
         1624 
         1625 /* not actually a sed function, but acts like one, put in last spot of script */
         1626 static void
         1627 cmd_last(Cmd *c)
         1628 {
         1629         if (!gflags.n)
         1630                 check_puts(patt.str, stdout);
         1631         do_writes();
         1632         new_next();
         1633 }
         1634 
         1635 /*
         1636  * Actions
         1637  */
         1638 
         1639 /* read new line, continue current cycle */
         1640 static void
         1641 new_line(void)
         1642 {
         1643         while (read_line(file, &patt) == EOF) {
         1644                 if (next_file()) {
         1645                         gflags.halt = 1;
         1646                         return;
         1647                 }
         1648         }
         1649         gflags.s = 0;
         1650         lineno++;
         1651 }
         1652 
         1653 /* append new line, continue current cycle
         1654  * FIXME: used for N, POSIX specifies do not print pattern space when out of
         1655  *        input, but GNU does so busybox does as well. Currently we don't.
         1656  *        Should we?
         1657  */
         1658 static void
         1659 app_line(void)
         1660 {
         1661         while (read_line(file, &genbuf) == EOF) {
         1662                 if (next_file()) {
         1663                         gflags.halt = 1;
         1664                         return;
         1665                 }
         1666         }
         1667 
         1668         stracat(&patt, "\n");
         1669         stracat(&patt, genbuf.str);
         1670         gflags.s = 0;
         1671         lineno++;
         1672 }
         1673 
         1674 /* read new line, start new cycle */
         1675 static void
         1676 new_next(void)
         1677 {
         1678         *patt.str = '\0';
         1679         update_ranges(pc + 1, prog + pcap);
         1680         new_line();
         1681         pc = prog - 1;
         1682 }
         1683 
         1684 /* keep old pattern space, start new cycle */
         1685 static void
         1686 old_next(void)
         1687 {
         1688         update_ranges(pc + 1, prog + pcap);
         1689         pc = prog - 1;
         1690 }
         1691 
         1692 int
         1693 main(int argc, char *argv[])
         1694 {
         1695         char *arg;
         1696         int script = 0;
         1697 
         1698         ARGBEGIN {
         1699         case 'n':
         1700                 gflags.n = 1;
         1701                 break;
         1702         case 'r':
         1703         case 'E':
         1704                 gflags.E = 1;
         1705                 break;
         1706         case 'e':
         1707                 arg = EARGF(usage());
         1708                 compile(arg, 0);
         1709                 script = 1;
         1710                 break;
         1711         case 'f':
         1712                 arg = EARGF(usage());
         1713                 compile(arg, 1);
         1714                 script = 1;
         1715                 break;
         1716         default : usage();
         1717         } ARGEND
         1718 
         1719         /* no script to run */
         1720         if (!script && !argc)
         1721                 usage();
         1722 
         1723         /* no script yet, next argument is script */
         1724         if (!script)
         1725                 compile(*argv++, 0);
         1726 
         1727         /* shrink/grow memory to fit and add our last instruction */
         1728         resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL);
         1729         pc = prog + pcap - 1;
         1730         pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 };
         1731 
         1732         files = argv;
         1733         run();
         1734 
         1735         ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
         1736 
         1737         return ret;
         1738 }