sort.c - sbase - suckless unix tools
 (HTM) git clone git://git.suckless.org/sbase
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       sort.c (9468B)
       ---
            1 /* See LICENSE file for copyright and license details. */
            2 #include <ctype.h>
            3 #include <stdio.h>
            4 #include <stdlib.h>
            5 #include <string.h>
            6 
            7 #include "queue.h"
            8 #include "text.h"
            9 #include "utf.h"
           10 #include "util.h"
           11 
           12 struct keydef {
           13         int start_column;
           14         int end_column;
           15         int start_char;
           16         int end_char;
           17         int flags;
           18         TAILQ_ENTRY(keydef) entry;
           19 };
           20 
           21 struct column {
           22         struct line line;
           23         size_t cap;
           24 };
           25 
           26 enum {
           27         MOD_N      = 1 << 0,
           28         MOD_STARTB = 1 << 1,
           29         MOD_ENDB   = 1 << 2,
           30         MOD_R      = 1 << 3,
           31         MOD_D      = 1 << 4,
           32         MOD_F      = 1 << 5,
           33         MOD_I      = 1 << 6,
           34 };
           35 
           36 static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
           37 
           38 static int Cflag = 0, cflag = 0, uflag = 0;
           39 static char *fieldsep = NULL;
           40 static size_t fieldseplen = 0;
           41 static struct column col1, col2;
           42 
           43 static void
           44 skipblank(struct line *a)
           45 {
           46         while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) {
           47                 a->data++;
           48                 a->len--;
           49         }
           50 }
           51 
           52 static void
           53 skipnonblank(struct line *a)
           54 {
           55         while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' &&
           56                           *(a->data) != '\t')) {
           57                 a->data++;
           58                 a->len--;
           59         }
           60 }
           61 
           62 static void
           63 skipcolumn(struct line *a, int skip_to_next_col)
           64 {
           65         char *s;
           66 
           67         if (fieldsep) {
           68                 if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
           69                         if (skip_to_next_col)
           70                                 s += fieldseplen;
           71                         a->len -= s - a->data;
           72                         a->data = s;
           73                 } else {
           74                         a->data += a->len - 1;
           75                         a->len = 1;
           76                 }
           77         } else {
           78                 skipblank(a);
           79                 skipnonblank(a);
           80         }
           81 }
           82 
           83 static void
           84 columns(struct line *line, const struct keydef *kd, struct column *col)
           85 {
           86         Rune r;
           87         struct line start, end;
           88         size_t utflen, rlen;
           89         int i;
           90 
           91         start.data = line->data;
           92         start.len = line->len;
           93         for (i = 1; i < kd->start_column; i++)
           94                 skipcolumn(&start, 1);
           95         if (kd->flags & MOD_STARTB)
           96                 skipblank(&start);
           97         for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) {
           98                 rlen = chartorune(&r, start.data);
           99                 start.data += rlen;
          100                 start.len -= rlen;
          101                 utflen++;
          102         }
          103 
          104         end.data = line->data;
          105         end.len = line->len;
          106         if (kd->end_column) {
          107                 for (i = 1; i < kd->end_column; i++)
          108                         skipcolumn(&end, 1);
          109                 if (kd->flags & MOD_ENDB)
          110                         skipblank(&end);
          111                 if (kd->end_char) {
          112                         for (utflen = 0; end.len > 1 && utflen < kd->end_char;) {
          113                                 rlen = chartorune(&r, end.data);
          114                                 end.data += rlen;
          115                                 end.len -= rlen;
          116                                 utflen++;
          117                         }
          118                 } else {
          119                         skipcolumn(&end, 0);
          120                 }
          121         } else {
          122                 end.data += end.len - 1;
          123                 end.len = 1;
          124         }
          125         col->line.len = MAX(0, end.data - start.data);
          126         if (!(col->line.data) || col->cap < col->line.len + 1) {
          127                 free(col->line.data);
          128                 col->line.data = emalloc(col->line.len + 1);
          129                 col->cap = col->line.len + 1;
          130         }
          131         memcpy(col->line.data, start.data, col->line.len);
          132         col->line.data[col->line.len] = '\0';
          133 }
          134 
          135 static int
          136 skipmodcmp(struct line *a, struct line *b, int flags)
          137 {
          138         Rune r1, r2;
          139         size_t offa = 0, offb = 0;
          140 
          141         do {
          142                 offa += chartorune(&r1, a->data + offa);
          143                 offb += chartorune(&r2, b->data + offb);
          144 
          145                 if (flags & MOD_D && flags & MOD_I) {
          146                         while (offa < a->len && ((!isblankrune(r1) &&
          147                                !isalnumrune(r1)) || (!isprintrune(r1))))
          148                                 offa += chartorune(&r1, a->data + offa);
          149                         while (offb < b->len && ((!isblankrune(r2) &&
          150                                !isalnumrune(r2)) || (!isprintrune(r2))))
          151                                 offb += chartorune(&r2, b->data + offb);
          152                 }
          153                 else if (flags & MOD_D) {
          154                         while (offa < a->len && !isblankrune(r1) &&
          155                                !isalnumrune(r1))
          156                                 offa += chartorune(&r1, a->data + offa);
          157                         while (offb < b->len && !isblankrune(r2) &&
          158                                !isalnumrune(r2))
          159                                 offb += chartorune(&r2, b->data + offb);
          160                 }
          161                 else if (flags & MOD_I) {
          162                         while (offa < a->len && !isprintrune(r1))
          163                                 offa += chartorune(&r1, a->data + offa);
          164                         while (offb < b->len && !isprintrune(r2))
          165                                 offb += chartorune(&r2, b->data + offb);
          166                 }
          167                 if (flags & MOD_F) {
          168                         r1 = toupperrune(r1);
          169                         r2 = toupperrune(r2);
          170                 }
          171         } while (r1 && r1 == r2);
          172 
          173         return r1 - r2;
          174 }
          175 
          176 static int
          177 slinecmp(struct line *a, struct line *b)
          178 {
          179         int res = 0;
          180         double x, y;
          181         struct keydef *kd;
          182 
          183         TAILQ_FOREACH(kd, &kdhead, entry) {
          184                 columns(a, kd, &col1);
          185                 columns(b, kd, &col2);
          186 
          187                 /* if -u is given, don't use default key definition
          188                  * unless it is the only one */
          189                 if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) &&
          190                     TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
          191                         res = 0;
          192                 } else if (kd->flags & MOD_N) {
          193                         x = strtod(col1.line.data, NULL);
          194                         y = strtod(col2.line.data, NULL);
          195                         res = (x < y) ? -1 : (x > y);
          196                 } else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
          197                         res = skipmodcmp(&col1.line, &col2.line, kd->flags);
          198                 } else {
          199                         res = linecmp(&col1.line, &col2.line);
          200                 }
          201 
          202                 if (kd->flags & MOD_R)
          203                         res = -res;
          204                 if (res)
          205                         break;
          206         }
          207 
          208         return res;
          209 }
          210 
          211 static int
          212 check(FILE *fp, const char *fname)
          213 {
          214         static struct line prev, cur, tmp;
          215         static size_t prevsize, cursize, tmpsize;
          216         ssize_t len;
          217 
          218         if (!prev.data) {
          219                 if ((len = getline(&prev.data, &prevsize, fp)) < 0)
          220                         eprintf("getline:");
          221                 prev.len = len;
          222         }
          223         while ((len = getline(&cur.data, &cursize, fp)) > 0) {
          224                 cur.len = len;
          225                 if (uflag > slinecmp(&cur, &prev)) {
          226                         if (!Cflag) {
          227                                 weprintf("disorder %s: ", fname);
          228                                 fwrite(cur.data, 1, cur.len, stderr);
          229                         }
          230                         return 1;
          231                 }
          232                 tmp = cur;
          233                 tmpsize = cursize;
          234                 cur = prev;
          235                 cursize = prevsize;
          236                 prev = tmp;
          237                 prevsize = tmpsize;
          238         }
          239 
          240         return 0;
          241 }
          242 
          243 static int
          244 parse_flags(char **s, int *flags, int bflag)
          245 {
          246         while (isalpha((int)**s)) {
          247                 switch (*((*s)++)) {
          248                 case 'b':
          249                         *flags |= bflag;
          250                         break;
          251                 case 'd':
          252                         *flags |= MOD_D;
          253                         break;
          254                 case 'f':
          255                         *flags |= MOD_F;
          256                         break;
          257                 case 'i':
          258                         *flags |= MOD_I;
          259                         break;
          260                 case 'n':
          261                         *flags |= MOD_N;
          262                         break;
          263                 case 'r':
          264                         *flags |= MOD_R;
          265                         break;
          266                 default:
          267                         return -1;
          268                 }
          269         }
          270 
          271         return 0;
          272 }
          273 
          274 static void
          275 addkeydef(char *kdstr, int flags)
          276 {
          277         struct keydef *kd;
          278 
          279         kd = enmalloc(2, sizeof(*kd));
          280 
          281         /* parse key definition kdstr with format
          282          * start_column[.start_char][flags][,end_column[.end_char][flags]]
          283          */
          284         kd->start_column = 1;
          285         kd->start_char = 1;
          286         kd->end_column = 0; /* 0 means end of line */
          287         kd->end_char = 0;   /* 0 means end of column */
          288         kd->flags = flags;
          289 
          290         if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1)
          291                 enprintf(2, "invalid start column in key definition\n");
          292 
          293         if (*kdstr == '.') {
          294                 if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1)
          295                         enprintf(2, "invalid start character in key "
          296                                  "definition\n");
          297         }
          298         if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0)
          299                 enprintf(2, "invalid start flags in key definition\n");
          300 
          301         if (*kdstr == ',') {
          302                 if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0)
          303                         enprintf(2, "invalid end column in key definition\n");
          304                 if (*kdstr == '.') {
          305                         if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0)
          306                                 enprintf(2, "invalid end character in key "
          307                                          "definition\n");
          308                 }
          309                 if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0)
          310                         enprintf(2, "invalid end flags in key definition\n");
          311         }
          312 
          313         if (*kdstr != '\0')
          314                 enprintf(2, "invalid key definition\n");
          315 
          316         TAILQ_INSERT_TAIL(&kdhead, kd, entry);
          317 }
          318 
          319 static void
          320 usage(void)
          321 {
          322         enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] "
          323                  "[-k def]... [file ...]\n", argv0);
          324 }
          325 
          326 int
          327 main(int argc, char *argv[])
          328 {
          329         FILE *fp, *ofp = stdout;
          330         struct linebuf linebuf = EMPTY_LINEBUF;
          331         size_t i;
          332         int global_flags = 0, ret = 0;
          333         char *outfile = NULL;
          334 
          335         ARGBEGIN {
          336         case 'C':
          337                 Cflag = 1;
          338                 break;
          339         case 'b':
          340                 global_flags |= MOD_STARTB | MOD_ENDB;
          341                 break;
          342         case 'c':
          343                 cflag = 1;
          344                 break;
          345         case 'd':
          346                 global_flags |= MOD_D;
          347                 break;
          348         case 'f':
          349                 global_flags |= MOD_F;
          350                 break;
          351         case 'i':
          352                 global_flags |= MOD_I;
          353                 break;
          354         case 'k':
          355                 addkeydef(EARGF(usage()), global_flags);
          356                 break;
          357         case 'm':
          358                 /* more or less for free, but for performance-reasons,
          359                  * we should keep this flag in mind and maybe some later
          360                  * day implement it properly so we don't run out of memory
          361                  * while merging large sorted files.
          362                  */
          363                 break;
          364         case 'n':
          365                 global_flags |= MOD_N;
          366                 break;
          367         case 'o':
          368                 outfile = EARGF(usage());
          369                 break;
          370         case 'r':
          371                 global_flags |= MOD_R;
          372                 break;
          373         case 't':
          374                 fieldsep = EARGF(usage());
          375                 if (!*fieldsep)
          376                         eprintf("empty delimiter\n");
          377                 fieldseplen = unescape(fieldsep);
          378                 break;
          379         case 'u':
          380                 uflag = 1;
          381                 break;
          382         default:
          383                 usage();
          384         } ARGEND
          385 
          386         /* -b shall only apply to custom key definitions */
          387         if (TAILQ_EMPTY(&kdhead) && global_flags)
          388                 addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
          389         if (TAILQ_EMPTY(&kdhead) || (!Cflag && !cflag))
          390                 addkeydef("1", global_flags & MOD_R);
          391 
          392         if (!argc) {
          393                 if (Cflag || cflag) {
          394                         if (check(stdin, "<stdin>") && !ret)
          395                                 ret = 1;
          396                 } else {
          397                         getlines(stdin, &linebuf);
          398                 }
          399         } else for (; *argv; argc--, argv++) {
          400                 if (!strcmp(*argv, "-")) {
          401                         *argv = "<stdin>";
          402                         fp = stdin;
          403                 } else if (!(fp = fopen(*argv, "r"))) {
          404                         enprintf(2, "fopen %s:", *argv);
          405                         continue;
          406                 }
          407                 if (Cflag || cflag) {
          408                         if (check(fp, *argv) && !ret)
          409                                 ret = 1;
          410                 } else {
          411                         getlines(fp, &linebuf);
          412                 }
          413                 if (fp != stdin && fshut(fp, *argv))
          414                         ret = 2;
          415         }
          416 
          417         if (!Cflag && !cflag) {
          418                 if (outfile && !(ofp = fopen(outfile, "w")))
          419                         eprintf("fopen %s:", outfile);
          420 
          421                 qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
          422                                 (int (*)(const void *, const void *))slinecmp);
          423 
          424                 for (i = 0; i < linebuf.nlines; i++) {
          425                         if (!uflag || i == 0 ||
          426                             slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
          427                                 fwrite(linebuf.lines[i].data, 1,
          428                                        linebuf.lines[i].len, ofp);
          429                         }
          430                 }
          431         }
          432 
          433         if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
          434             fshut(stderr, "<stderr>"))
          435                 ret = 2;
          436 
          437         return ret;
          438 }