tfile.c - plan9port - [fork] Plan 9 from user space
 (HTM) git clone git://src.adamsgaard.dk/plan9port
 (DIR) Log
 (DIR) Files
 (DIR) Refs
 (DIR) README
 (DIR) LICENSE
       ---
       tfile.c (23931B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include <ctype.h>
            5 #include <mach.h>
            6 
            7 /*
            8  * file - determine type of file
            9  */
           10 #define        LENDIAN(p)        ((p)[0] | ((p)[1]<<8) | ((p)[2]<<16) | ((p)[3]<<24))
           11 
           12 uchar        buf[6001];
           13 short        cfreq[140];
           14 short        wfreq[50];
           15 int        nbuf;
           16 Dir*        mbuf;
           17 int        fd;
           18 char         *fname;
           19 char        *slash;
           20 
           21 enum
           22 {
           23         Cword,
           24         Fword,
           25         Aword,
           26         Alword,
           27         Lword,
           28         I1,
           29         I2,
           30         I3,
           31         Clatin        = 128,
           32         Cbinary,
           33         Cnull,
           34         Ceascii,
           35         Cutf,
           36 };
           37 struct
           38 {
           39         char*        word;
           40         int        class;
           41 } dict[] =
           42 {
           43         "PATH",                Lword,
           44         "TEXT",                Aword,
           45         "adt",                Alword,
           46         "aggr",                Alword,
           47         "alef",                Alword,
           48         "array",        Lword,
           49         "block",        Fword,
           50         "chan",                Alword,
           51         "char",                Cword,
           52         "common",        Fword,
           53         "con",                Lword,
           54         "data",                Fword,
           55         "dimension",        Fword,
           56         "double",        Cword,
           57         "extern",        Cword,
           58         "bio",                I2,
           59         "float",        Cword,
           60         "fn",                Lword,
           61         "function",        Fword,
           62         "h",                I3,
           63         "implement",        Lword,
           64         "import",        Lword,
           65         "include",        I1,
           66         "int",                Cword,
           67         "integer",        Fword,
           68         "iota",                Lword,
           69         "libc",                I2,
           70         "long",                Cword,
           71         "module",        Lword,
           72         "real",                Fword,
           73         "ref",                Lword,
           74         "register",        Cword,
           75         "self",                Lword,
           76         "short",        Cword,
           77         "static",        Cword,
           78         "stdio",        I2,
           79         "struct",        Cword,
           80         "subroutine",        Fword,
           81         "u",                I2,
           82         "void",                Cword,
           83 };
           84 
           85 /* codes for 'mode' field in language structure */
           86 enum        {
           87                 Normal        = 0,
           88                 First,                /* first entry for language spanning several ranges */
           89                 Multi,                /* later entries "   "       "  ... */
           90                 Shared,                /* codes used in several languages */
           91         };
           92 
           93 struct
           94 {
           95         int        mode;                /* see enum above */
           96         int         count;
           97         int        low;
           98         int        high;
           99         char        *name;
          100 
          101 } language[] =
          102 {
          103         Normal, 0,        0x0080, 0x0080,        "Extended Latin",
          104         Normal,        0,        0x0100,        0x01FF,        "Extended Latin",
          105         Normal,        0,        0x0370,        0x03FF,        "Greek",
          106         Normal,        0,        0x0400,        0x04FF,        "Cyrillic",
          107         Normal,        0,        0x0530,        0x058F,        "Armenian",
          108         Normal,        0,        0x0590,        0x05FF,        "Hebrew",
          109         Normal,        0,        0x0600,        0x06FF,        "Arabic",
          110         Normal,        0,        0x0900,        0x097F,        "Devanagari",
          111         Normal,        0,        0x0980,        0x09FF,        "Bengali",
          112         Normal,        0,        0x0A00,        0x0A7F,        "Gurmukhi",
          113         Normal,        0,        0x0A80,        0x0AFF,        "Gujarati",
          114         Normal,        0,        0x0B00,        0x0B7F,        "Oriya",
          115         Normal,        0,        0x0B80,        0x0BFF,        "Tamil",
          116         Normal,        0,        0x0C00,        0x0C7F,        "Telugu",
          117         Normal,        0,        0x0C80,        0x0CFF,        "Kannada",
          118         Normal,        0,        0x0D00,        0x0D7F,        "Malayalam",
          119         Normal,        0,        0x0E00,        0x0E7F,        "Thai",
          120         Normal,        0,        0x0E80,        0x0EFF,        "Lao",
          121         Normal,        0,        0x1000,        0x105F,        "Tibetan",
          122         Normal,        0,        0x10A0,        0x10FF,        "Georgian",
          123         Normal,        0,        0x3040,        0x30FF,        "Japanese",
          124         Normal,        0,        0x3100,        0x312F,        "Chinese",
          125         First,        0,        0x3130,        0x318F,        "Korean",
          126         Multi,        0,        0x3400,        0x3D2F,        "Korean",
          127         Shared,        0,        0x4e00,        0x9fff,        "CJK",
          128         Normal,        0,        0,        0,        0,                /* terminal entry */
          129 };
          130 
          131 
          132 enum
          133 {
          134         Fascii,                /* printable ascii */
          135         Flatin,                /* latin 1*/
          136         Futf,                /* UTf character set */
          137         Fbinary,        /* binary */
          138         Feascii,        /* ASCII with control chars */
          139         Fnull,                /* NULL in file */
          140 } guess;
          141 
          142 void        bump_utf_count(Rune);
          143 int        cistrncmp(char*, char*, int);
          144 void        filetype(int);
          145 int        getfontnum(uchar*, uchar**);
          146 int        isas(void);
          147 int        isc(void);
          148 int        isenglish(void);
          149 int        ishp(void);
          150 int        ishtml(void);
          151 int        isrfc822(void);
          152 int        ismbox(void);
          153 int        islimbo(void);
          154 int        ismung(void);
          155 int        isp9bit(void);
          156 int        isp9font(void);
          157 int        isrtf(void);
          158 int        ismsdos(void);
          159 int        iself(void);
          160 int        istring(void);
          161 int        iff(void);
          162 int        long0(void);
          163 int        istar(void);
          164 int        p9bitnum(uchar*);
          165 int        p9subfont(uchar*);
          166 void        print_utf(void);
          167 void        type(char*, int);
          168 int        utf_count(void);
          169 void        wordfreq(void);
          170 
          171 int        (*call[])(void) =
          172 {
          173         long0,                /* recognizable by first 4 bytes */
          174         istring,        /* recognizable by first string */
          175         iff,                /* interchange file format (strings) */
          176         isrfc822,        /* email file */
          177         ismbox,                /* mail box */
          178         istar,                /* recognizable by tar checksum */
          179         ishtml,                /* html keywords */
          180 /*        iscint,                /* compiler/assembler intermediate */
          181         islimbo,        /* limbo source */
          182         isc,                /* c & alef compiler key words */
          183         isas,                /* assembler key words */
          184         ismung,                /* entropy compressed/encrypted */
          185         isp9font,        /* plan 9 font */
          186         isp9bit,        /* plan 9 image (as from /dev/window) */
          187         isenglish,        /* char frequency English */
          188         isrtf,                /* rich text format */
          189         ismsdos,        /* msdos exe (virus file attachement) */
          190         iself,                /* ELF (foreign) executable */
          191         0
          192 };
          193 
          194 int mime;
          195 
          196 #define OCTET        "application/octet-stream\n"
          197 #define PLAIN        "text/plain\n"
          198 
          199 void
          200 main(int argc, char *argv[])
          201 {
          202         int i, j, maxlen;
          203         char *cp;
          204         Rune r;
          205 
          206         ARGBEGIN{
          207         case 'm':
          208                 mime = 1;
          209                 break;
          210         default:
          211                 fprint(2, "usage: file [-m] [file...]\n");
          212                 exits("usage");
          213         }ARGEND;
          214 
          215         maxlen = 0;
          216         if(mime == 0 || argc > 1){
          217                 for(i = 0; i < argc; i++) {
          218                         for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
          219                                         ;
          220                         if(j > maxlen)
          221                                 maxlen = j;
          222                 }
          223         }
          224         if (argc <= 0) {
          225                 if(!mime)
          226                         print ("stdin: ");
          227                 filetype(0);
          228         }
          229         else {
          230                 for(i = 0; i < argc; i++)
          231                         type(argv[i], maxlen);
          232         }
          233         exits(0);
          234 }
          235 
          236 void
          237 type(char *file, int nlen)
          238 {
          239         Rune r;
          240         int i;
          241         char *p;
          242 
          243         if(nlen > 0){
          244                 slash = 0;
          245                 for (i = 0, p = file; *p; i++) {
          246                         if (*p == '/')                        /* find rightmost slash */
          247                                 slash = p;
          248                         p += chartorune(&r, p);                /* count runes */
          249                 }
          250                 print("%s:%*s",file, nlen-i+1, "");
          251         }
          252         fname = file;
          253         if ((fd = open(file, OREAD)) < 0) {
          254                 print("cannot open\n");
          255                 return;
          256         }
          257         filetype(fd);
          258         close(fd);
          259 }
          260 
          261 void
          262 filetype(int fd)
          263 {
          264         Rune r;
          265         int i, f, n;
          266         char *p, *eob;
          267 
          268         free(mbuf);
          269         mbuf = dirfstat(fd);
          270         if(mbuf == nil){
          271                 print("cannot stat: %r\n");
          272                 return;
          273         }
          274         if(mbuf->mode & DMDIR) {
          275                 print(mime ? "text/directory\n" : "directory\n");
          276                 return;
          277         }
          278         if(mbuf->type != 'M' && mbuf->type != '|') {
          279                 print(mime ? OCTET : "special file #%c/%s\n",
          280                         mbuf->type, mbuf->name);
          281                 return;
          282         }
          283         nbuf = read(fd, buf, sizeof(buf)-1);
          284 
          285         if(nbuf < 0) {
          286                 print("cannot read\n");
          287                 return;
          288         }
          289         if(nbuf == 0) {
          290                 print(mime ? PLAIN : "empty file\n");
          291                 return;
          292         }
          293         buf[nbuf] = 0;
          294 
          295         /*
          296          * build histogram table
          297          */
          298         memset(cfreq, 0, sizeof(cfreq));
          299         for (i = 0; language[i].name; i++)
          300                 language[i].count = 0;
          301         eob = (char *)buf+nbuf;
          302         for(n = 0, p = (char *)buf; p < eob; n++) {
          303                 if (!fullrune(p, eob-p) && eob-p < UTFmax)
          304                         break;
          305                 p += chartorune(&r, p);
          306                 if (r == 0)
          307                         f = Cnull;
          308                 else if (r <= 0x7f) {
          309                         if (!isprint(r) && !isspace(r))
          310                                 f = Ceascii;        /* ASCII control char */
          311                         else f = r;
          312                 } else if (r == 0x080) {
          313                         bump_utf_count(r);
          314                         f = Cutf;
          315                 } else if (r < 0xA0)
          316                                 f = Cbinary;        /* Invalid Runes */
          317                 else if (r <= 0xff)
          318                                 f = Clatin;        /* Latin 1 */
          319                 else {
          320                         bump_utf_count(r);
          321                         f = Cutf;                /* UTF extension */
          322                 }
          323                 cfreq[f]++;                        /* ASCII chars peg directly */
          324         }
          325         /*
          326          * gross classify
          327          */
          328         if (cfreq[Cbinary])
          329                 guess = Fbinary;
          330         else if (cfreq[Cutf])
          331                 guess = Futf;
          332         else if (cfreq[Clatin])
          333                 guess = Flatin;
          334         else if (cfreq[Ceascii])
          335                 guess = Feascii;
          336         else if (cfreq[Cnull] == n) {
          337                 print(mime ? OCTET : "first block all null bytes\n");
          338                 return;
          339         }
          340         else guess = Fascii;
          341         /*
          342          * lookup dictionary words
          343          */
          344         memset(wfreq, 0, sizeof(wfreq));
          345         if(guess == Fascii || guess == Flatin || guess == Futf)
          346                 wordfreq();
          347         /*
          348          * call individual classify routines
          349          */
          350         for(i=0; call[i]; i++)
          351                 if((*call[i])())
          352                         return;
          353 
          354         /*
          355          * if all else fails,
          356          * print out gross classification
          357          */
          358         if (nbuf < 100 && !mime)
          359                 print(mime ? PLAIN : "short ");
          360         if (guess == Fascii)
          361                 print(mime ? PLAIN : "Ascii\n");
          362         else if (guess == Feascii)
          363                 print(mime ? PLAIN : "extended ascii\n");
          364         else if (guess == Flatin)
          365                 print(mime ? PLAIN : "latin ascii\n");
          366         else if (guess == Futf && utf_count() < 4)
          367                 print_utf();
          368         else print(mime ? OCTET : "binary\n");
          369 }
          370 
          371 void
          372 bump_utf_count(Rune r)
          373 {
          374         int low, high, mid;
          375 
          376         high = sizeof(language)/sizeof(language[0])-1;
          377         for (low = 0; low < high;) {
          378                 mid = (low+high)/2;
          379                 if (r >=language[mid].low) {
          380                         if (r <= language[mid].high) {
          381                                 language[mid].count++;
          382                                 break;
          383                         } else low = mid+1;
          384                 } else high = mid;
          385         }
          386 }
          387 
          388 int
          389 utf_count(void)
          390 {
          391         int i, count;
          392 
          393         count = 0;
          394         for (i = 0; language[i].name; i++)
          395                 if (language[i].count > 0)
          396                         switch (language[i].mode) {
          397                         case Normal:
          398                         case First:
          399                                 count++;
          400                                 break;
          401                         default:
          402                                 break;
          403                         }
          404         return count;
          405 }
          406 
          407 int
          408 chkascii(void)
          409 {
          410         int i;
          411 
          412         for (i = 'a'; i < 'z'; i++)
          413                 if (cfreq[i])
          414                         return 1;
          415         for (i = 'A'; i < 'Z'; i++)
          416                 if (cfreq[i])
          417                         return 1;
          418         return 0;
          419 }
          420 
          421 int
          422 find_first(char *name)
          423 {
          424         int i;
          425 
          426         for (i = 0; language[i].name != 0; i++)
          427                 if (language[i].mode == First
          428                         && strcmp(language[i].name, name) == 0)
          429                         return i;
          430         return -1;
          431 }
          432 
          433 void
          434 print_utf(void)
          435 {
          436         int i, printed, j;
          437 
          438         if(mime){
          439                 print(PLAIN);
          440                 return;
          441         }
          442         if (chkascii()) {
          443                 printed = 1;
          444                 print("Ascii");
          445         } else
          446                 printed = 0;
          447         for (i = 0; language[i].name; i++)
          448                 if (language[i].count) {
          449                         switch(language[i].mode) {
          450                         case Multi:
          451                                 j = find_first(language[i].name);
          452                                 if (j < 0)
          453                                         break;
          454                                 if (language[j].count > 0)
          455                                         break;
          456                                 /* Fall through */
          457                         case Normal:
          458                         case First:
          459                                 if (printed)
          460                                         print(" & ");
          461                                 else printed = 1;
          462                                 print("%s", language[i].name);
          463                                 break;
          464                         case Shared:
          465                         default:
          466                                 break;
          467                         }
          468                 }
          469         if(!printed)
          470                 print("UTF");
          471         print(" text\n");
          472 }
          473 
          474 void
          475 wordfreq(void)
          476 {
          477         int low, high, mid, r;
          478         uchar *p, *p2, c;
          479 
          480         p = buf;
          481         for(;;) {
          482                 while (p < buf+nbuf && !isalpha(*p))
          483                         p++;
          484                 if (p >= buf+nbuf)
          485                         return;
          486                 p2 = p;
          487                 while(p < buf+nbuf && isalpha(*p))
          488                         p++;
          489                 c = *p;
          490                 *p = 0;
          491                 high = sizeof(dict)/sizeof(dict[0]);
          492                 for(low = 0;low < high;) {
          493                         mid = (low+high)/2;
          494                         r = strcmp(dict[mid].word, (char*)p2);
          495                         if(r == 0) {
          496                                 wfreq[dict[mid].class]++;
          497                                 break;
          498                         }
          499                         if(r < 0)
          500                                 low = mid+1;
          501                         else
          502                                 high = mid;
          503                 }
          504                 *p++ = c;
          505         }
          506 }
          507 
          508 typedef struct Filemagic Filemagic;
          509 struct Filemagic {
          510         ulong x;
          511         ulong mask;
          512         char *desc;
          513         char *mime;
          514 };
          515 
          516 Filemagic long0tab[] = {
          517         0xF16DF16D,        0xFFFFFFFF,        "pac1 audio file\n",        OCTET,
          518         0x31636170,        0xFFFFFFFF,        "pac3 audio file\n",        OCTET,
          519         0x32636170,        0xFFFF00FF,        "pac4 audio file\n",        OCTET,
          520         0xBA010000,        0xFFFFFFFF,        "mpeg system stream\n",        OCTET,
          521         0x30800CC0,        0xFFFFFFFF,        "inferno .dis executable\n", OCTET,
          522         0x04034B50,        0xFFFFFFFF,        "zip archive\n", "application/zip\n",
          523         070707,                0xFFFF,                "cpio archive\n", OCTET,
          524         0x2F7,                0xFFFF,                "tex dvi\n", "application/dvi\n",
          525         0xfffa0000,        0xfffe0000,        "mp3 audio\n",        "audio/mpeg\n",
          526         0xcafebabe,        0xFFFFFFFF,        "Mach-O fat executable\n",        "application/x-mach-binary\n",
          527         0xfeedface,        0xFFFFFFFE,        "Mach-O executable\n",        "application/x-mach-binary\n",
          528         0xbebafeca,        0xFFFFFFFF,        "Java class\n",        "application/x-java-applet\n",
          529 };
          530 
          531 int
          532 filemagic(Filemagic *tab, int ntab, ulong x)
          533 {
          534         int i;
          535 
          536         for(i=0; i<ntab; i++)
          537                 if((x&tab[i].mask) == tab[i].x){
          538                         print(mime ? tab[i].mime : tab[i].desc);
          539                         return 1;
          540                 }
          541         return 0;
          542 }
          543 
          544 int
          545 long0(void)
          546 {
          547 /*        Fhdr *f; */
          548         long x;
          549 
          550         seek(fd, 0, 0);                /* reposition to start of file */
          551 /*
          552         if(crackhdr(fd, &f)) {
          553                 print(mime ? OCTET : "%s\n", f.name);
          554                 return 1;
          555         }
          556 */
          557         x = LENDIAN(buf);
          558         if(filemagic(long0tab, nelem(long0tab), x))
          559                 return 1;
          560         return 0;
          561 }
          562 
          563 /* from tar.c */
          564 enum { NAMSIZ = 100, TBLOCK = 512 };
          565 
          566 union        hblock
          567 {
          568         char        dummy[TBLOCK];
          569         struct        header
          570         {
          571                 char        name[NAMSIZ];
          572                 char        mode[8];
          573                 char        uid[8];
          574                 char        gid[8];
          575                 char        size[12];
          576                 char        mtime[12];
          577                 char        chksum[8];
          578                 char        linkflag;
          579                 char        linkname[NAMSIZ];
          580                 /* rest are defined by POSIX's ustar format; see p1003.2b */
          581                 char        magic[6];        /* "ustar" */
          582                 char        version[2];
          583                 char        uname[32];
          584                 char        gname[32];
          585                 char        devmajor[8];
          586                 char        devminor[8];
          587                 char        prefix[155];  /* if non-null, path = prefix "/" name */
          588         } dbuf;
          589 };
          590 
          591 int
          592 checksum(union hblock *hp)
          593 {
          594         int i;
          595         char *cp;
          596         struct header *hdr = &hp->dbuf;
          597 
          598         for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
          599                 *cp = ' ';
          600         i = 0;
          601         for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
          602                 i += *cp & 0xff;
          603         return i;
          604 }
          605 
          606 int
          607 istar(void)
          608 {
          609         int chksum;
          610         char tblock[TBLOCK];
          611         union hblock *hp = (union hblock *)tblock;
          612         struct header *hdr = &hp->dbuf;
          613 
          614         seek(fd, 0, 0);                /* reposition to start of file */
          615         if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
          616                 return 0;
          617         chksum = strtol(hdr->chksum, 0, 8);
          618         if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
          619                 if (strcmp(hdr->magic, "ustar") == 0)
          620                         print(mime? "application/x-ustar\n":
          621                                 "posix tar archive\n");
          622                 else
          623                         print(mime? "application/x-tar\n": "tar archive\n");
          624                 return 1;
          625         }
          626         return 0;
          627 }
          628 
          629 /*
          630  * initial words to classify file
          631  */
          632 struct        FILE_STRING
          633 {
          634         char         *key;
          635         char        *filetype;
          636         int        length;
          637         char        *mime;
          638 } file_string[] =
          639 {
          640         "!<arch>\n__.SYMDEF",        "archive random library",        16,        "application/octet-stream",
          641         "!<arch>\n",                "archive",                        8,        "application/octet-stream",
          642         "070707",                "cpio archive - ascii header",        6,        "application/octet-stream",
          643         "%!",                        "postscript",                        2,        "application/postscript",
          644         "\004%!",                "postscript",                        3,        "application/postscript",
          645         "x T post",                "troff output for post",        8,        "application/troff",
          646         "x T Latin1",                "troff output for Latin1",        10,        "application/troff",
          647         "x T utf",                "troff output for UTF",                7,        "application/troff",
          648         "x T 202",                "troff output for 202",                7,        "application/troff",
          649         "x T aps",                "troff output for aps",                7,        "application/troff",
          650         "GIF",                        "GIF image",                         3,        "image/gif",
          651         "\0PC Research, Inc\0",        "ghostscript fax file",                18,        "application/ghostscript",
          652         "%PDF",                        "PDF",                                4,        "application/pdf",
          653         "<html>\n",                "HTML file",                        7,        "text/html",
          654         "<HTML>\n",                "HTML file",                        7,        "text/html",
          655         "compressed\n",                "Compressed image or subfont",        11,        "application/octet-stream",
          656         "\111\111\052\000",        "tiff",                                4,        "image/tiff",
          657         "\115\115\000\052",        "tiff",                                4,        "image/tiff",
          658         "\377\330\377\340",        "jpeg",                                4,        "image/jpeg",
          659         "\377\330\377\341",        "jpeg",                                4,        "image/jpeg",
          660         "\377\330\377\333",        "jpeg",                                4,        "image/jpeg",
          661         "\106\117\126\142",        "x3f",                                4,        "image/x3f",
          662         "BM",                        "bmp",                                2,        "image/bmp",
          663         "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",        "microsoft office document",        8,        "application/octet-stream",
          664         "<MakerFile ",                "FrameMaker file",                11,        "application/framemaker",
          665         "\033%-12345X",        "HPJCL file",                9,        "application/hpjcl",
          666         "ID3",                        "mp3 audio with id3",        3,        "audio/mpeg",
          667         0,0,0,0
          668 };
          669 
          670 int
          671 istring(void)
          672 {
          673         int i, j;
          674         struct FILE_STRING *p;
          675 
          676         for(p = file_string; p->key; p++) {
          677                 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
          678                         if(mime)
          679                                 print("%s\n", p->mime);
          680                         else
          681                                 print("%s\n", p->filetype);
          682                         return 1;
          683                 }
          684         }
          685         if(strncmp((char*)buf, "TYPE=", 5) == 0) {        /* td */
          686                 for(i = 5; i < nbuf; i++)
          687                         if(buf[i] == '\n')
          688                                 break;
          689                 if(mime)
          690                         print(OCTET);
          691                 else
          692                         print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
          693                 return 1;
          694         }
          695         if(buf[0]=='#' && buf[1]=='!'){
          696                 i=2;
          697                 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
          698                         if(buf[j] == '/')
          699                                 i = j+1;
          700                 if(mime)
          701                         print(PLAIN);
          702                 else
          703                         print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
          704                 return 1;
          705         }
          706         return 0;
          707 }
          708 
          709 int
          710 iff(void)
          711 {
          712         if (strncmp((char*)buf, "FORM", 4) == 0 &&
          713             strncmp((char*)buf+8, "AIFF", 4) == 0) {
          714                 print("%s\n", mime? "audio/x-aiff": "aiff audio");
          715                 return 1;
          716         }
          717         return 0;
          718 }
          719 
/*
 * Tag names that ishtml() looks for between '<' and '>'.
 * The comparison there is case-insensitive; the list ends at the 0 entry.
 */
char*	html_string[] =
{
	"title",
	"body",
	"head",
	"strong",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"ul",
	"li",
	"dl",
	"br",
	"em",
	0,
};
          739 
          740 int
          741 ishtml(void)
          742 {
          743         uchar *p, *q;
          744         int i, count;
          745 
          746                 /* compare strings between '<' and '>' to html table */
          747         count = 0;
          748         p = buf;
          749         for(;;) {
          750                 while (p < buf+nbuf && *p != '<')
          751                         p++;
          752                 p++;
          753                 if (p >= buf+nbuf)
          754                         break;
          755                 if(*p == '/')
          756                         p++;
          757                 q = p;
          758                 while(p < buf+nbuf && *p != '>')
          759                         p++;
          760                 if (p >= buf+nbuf)
          761                         break;
          762                 for(i = 0; html_string[i]; i++) {
          763                         if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
          764                                 if(count++ > 4) {
          765                                         print(mime ? "text/html\n" : "HTML file\n");
          766                                         return 1;
          767                                 }
          768                                 break;
          769                         }
          770                 }
          771                 p++;
          772         }
          773         return 0;
          774 }
          775 
          776 char*        rfc822_string[] =
          777 {
          778         "from:",
          779         "date:",
          780         "to:",
          781         "subject:",
          782         "received:",
          783         "reply to:",
          784         "sender:",
          785         0,
          786 };
          787 
          788 int
          789 isrfc822(void)
          790 {
          791 
          792         char *p, *q, *r;
          793         int i, count;
          794 
          795         count = 0;
          796         p = (char*)buf;
          797         for(;;) {
          798                 q = strchr(p, '\n');
          799                 if(q == nil)
          800                         break;
          801                 *q = 0;
          802                 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
          803                         count++;
          804                         *q = '\n';
          805                         p = q+1;
          806                         continue;
          807                 }
          808                 *q = '\n';
          809                 if(*p != '\t' && *p != ' '){
          810                         r = strchr(p, ':');
          811                         if(r == 0 || r > q)
          812                                 break;
          813                         for(i = 0; rfc822_string[i]; i++) {
          814                                 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
          815                                         count++;
          816                                         break;
          817                                 }
          818                         }
          819                 }
          820                 p = q+1;
          821         }
          822         if(count >= 3){
          823                 print(mime ? "message/rfc822\n" : "email file\n");
          824                 return 1;
          825         }
          826         return 0;
          827 }
          828 
          829 int
          830 ismbox(void)
          831 {
          832         char *p, *q;
          833 
          834         p = (char*)buf;
          835         q = strchr(p, '\n');
          836         if(q == nil)
          837                 return 0;
          838         *q = 0;
          839         if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
          840                 print(mime ? "text/plain\n" : "mail box\n");
          841                 return 1;
          842         }
          843         *q = '\n';
          844         return 0;
          845 }
          846 
          847 int
          848 isc(void)
          849 {
          850         int n;
          851 
          852         n = wfreq[I1];
          853         /*
          854          * includes
          855          */
          856         if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
          857                 goto yes;
          858         if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
          859                 goto yes;
          860         /*
          861          * declarations
          862          */
          863         if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
          864                 goto yes;
          865         /*
          866          * assignments
          867          */
          868         if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
          869                 goto yes;
          870         return 0;
          871 
          872 yes:
          873         if(mime){
          874                 print(PLAIN);
          875                 return 1;
          876         }
          877         if(wfreq[Alword] > 0)
          878                 print("alef program\n");
          879         else
          880                 print("c program\n");
          881         return 1;
          882 }
          883 
          884 int
          885 islimbo(void)
          886 {
          887 
          888         /*
          889          * includes
          890          */
          891         if(wfreq[Lword] < 4)
          892                 return 0;
          893         print(mime ? PLAIN : "limbo program\n");
          894         return 1;
          895 }
          896 
          897 int
          898 isas(void)
          899 {
          900 
          901         /*
          902          * includes
          903          */
          904         if(wfreq[Aword] < 2)
          905                 return 0;
          906         print(mime ? PLAIN : "as program\n");
          907         return 1;
          908 }
          909 
          910 /*
          911  * low entropy means encrypted
          912  */
          913 int
          914 ismung(void)
          915 {
          916         int i, bucket[8];
          917         float cs;
          918 
          919         if(nbuf < 64)
          920                 return 0;
          921         memset(bucket, 0, sizeof(bucket));
          922         for(i=0; i<64; i++)
          923                 bucket[(buf[i]>>5)&07] += 1;
          924 
          925         cs = 0.;
          926         for(i=0; i<8; i++)
          927                 cs += (bucket[i]-8)*(bucket[i]-8);
          928         cs /= 8.;
          929         if(cs <= 24.322) {
          930                 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
          931                         print(mime ? OCTET : "compressed\n");
          932                 else
          933                         print(mime ? OCTET : "encrypted\n");
          934                 return 1;
          935         }
          936         return 0;
          937 }
          938 
          939 /*
          940  * english by punctuation and frequencies
          941  */
          942 int
          943 isenglish(void)
          944 {
          945         int vow, comm, rare, badpun, punct;
          946         char *p;
          947 
          948         if(guess != Fascii && guess != Feascii)
          949                 return 0;
          950         badpun = 0;
          951         punct = 0;
          952         for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
          953                 switch(*p) {
          954                 case '.':
          955                 case ',':
          956                 case ')':
          957                 case '%':
          958                 case ';':
          959                 case ':':
          960                 case '?':
          961                         punct++;
          962                         if(p[1] != ' ' && p[1] != '\n')
          963                                 badpun++;
          964                 }
          965         if(badpun*5 > punct)
          966                 return 0;
          967         if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e'])        /* shell file test */
          968                 return 0;
          969         if(2*cfreq[';'] > cfreq['e'])
          970                 return 0;
          971 
          972         vow = 0;
          973         for(p="AEIOU"; *p; p++) {
          974                 vow += cfreq[(uchar)*p];
          975                 vow += cfreq[tolower((uchar)*p)];
          976         }
          977         comm = 0;
          978         for(p="ETAION"; *p; p++) {
          979                 comm += cfreq[(uchar)*p];
          980                 comm += cfreq[tolower((uchar)*p)];
          981         }
          982         rare = 0;
          983         for(p="VJKQXZ"; *p; p++) {
          984                 rare += cfreq[(uchar)*p];
          985                 rare += cfreq[tolower((uchar)*p)];
          986         }
          987         if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
          988                 print(mime ? PLAIN : "English text\n");
          989                 return 1;
          990         }
          991         return 0;
          992 }
          993 
          994 /*
          995  * pick up a number with
          996  * syntax _*[0-9]+_
          997  */
          998 #define        P9BITLEN        12
          999 int
         1000 p9bitnum(uchar *bp)
         1001 {
         1002         int n, c, len;
         1003 
         1004         len = P9BITLEN;
         1005         while(*bp == ' ') {
         1006                 bp++;
         1007                 len--;
         1008                 if(len <= 0)
         1009                         return -1;
         1010         }
         1011         n = 0;
         1012         while(len > 1) {
         1013                 c = *bp++;
         1014                 if(!isdigit(c))
         1015                         return -1;
         1016                 n = n*10 + c-'0';
         1017                 len--;
         1018         }
         1019         if(*bp != ' ')
         1020                 return -1;
         1021         return n;
         1022 }
         1023 
         1024 int
         1025 depthof(char *s, int *newp)
         1026 {
         1027         char *es;
         1028         int d;
         1029 
         1030         *newp = 0;
         1031         es = s+12;
         1032         while(s<es && *s==' ')
         1033                 s++;
         1034         if(s == es)
         1035                 return -1;
         1036         if('0'<=*s && *s<='9')
         1037                 return 1<<atoi(s);
         1038 
         1039         *newp = 1;
         1040         d = 0;
         1041         while(s<es && *s!=' '){
         1042                 s++;        /* skip letter */
         1043                 d += strtoul(s, &s, 10);
         1044         }
         1045 
         1046         switch(d){
         1047         case 32:
         1048         case 24:
         1049         case 16:
         1050         case 8:
         1051                 return d;
         1052         }
         1053         return -1;
         1054 }
         1055 
         1056 int
         1057 isp9bit(void)
         1058 {
         1059         int dep, lox, loy, hix, hiy, px, new;
         1060         ulong t;
         1061         long len;
         1062         char *newlabel;
         1063 
         1064         newlabel = "old ";
         1065 
         1066         dep = depthof((char*)buf + 0*P9BITLEN, &new);
         1067         if(new)
         1068                 newlabel = "";
         1069         lox = p9bitnum(buf + 1*P9BITLEN);
         1070         loy = p9bitnum(buf + 2*P9BITLEN);
         1071         hix = p9bitnum(buf + 3*P9BITLEN);
         1072         hiy = p9bitnum(buf + 4*P9BITLEN);
         1073         if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
         1074                 return 0;
         1075 
         1076         if(dep < 8){
         1077                 px = 8/dep;        /* pixels per byte */
         1078                 /* set l to number of bytes of data per scan line */
         1079                 if(lox >= 0)
         1080                         len = (hix+px-1)/px - lox/px;
         1081                 else{        /* make positive before divide */
         1082                         t = (-lox)+px-1;
         1083                         t = (t/px)*px;
         1084                         len = (t+hix+px-1)/px;
         1085                 }
         1086         }else
         1087                 len = (hix-lox)*dep/8;
         1088         len *= (hiy-loy);                /* col length */
         1089         len += 5*P9BITLEN;                /* size of initial ascii */
         1090 
         1091         /*
         1092          * for image file, length is non-zero and must match calculation above
         1093          * for /dev/window and /dev/screen the length is always zero
         1094          * for subfont, the subfont header should follow immediately.
         1095          */
         1096         if (len != 0 && mbuf->length == 0) {
         1097                 print("%splan 9 image\n", newlabel);
         1098                 return 1;
         1099         }
         1100         if (mbuf->length == len) {
         1101                 print("%splan 9 image\n", newlabel);
         1102                 return 1;
         1103         }
         1104         /* Ghostscript sometimes produces a little extra on the end */
         1105         if (mbuf->length < len+P9BITLEN) {
         1106                 print("%splan 9 image\n", newlabel);
         1107                 return 1;
         1108         }
         1109         if (p9subfont(buf+len)) {
         1110                 print("%ssubfont file\n", newlabel);
         1111                 return 1;
         1112         }
         1113         return 0;
         1114 }
         1115 
         1116 int
         1117 p9subfont(uchar *p)
         1118 {
         1119         int n, h, a;
         1120 
         1121                 /* if image too big, assume it's a subfont */
         1122         if (p+3*P9BITLEN > buf+sizeof(buf))
         1123                 return 1;
         1124 
         1125         n = p9bitnum(p + 0*P9BITLEN);        /* char count */
         1126         if (n < 0)
         1127                 return 0;
         1128         h = p9bitnum(p + 1*P9BITLEN);        /* height */
         1129         if (h < 0)
         1130                 return 0;
         1131         a = p9bitnum(p + 2*P9BITLEN);        /* ascent */
         1132         if (a < 0)
         1133                 return 0;
         1134         return 1;
         1135 }
         1136 
         1137 #define        WHITESPACE(c)                ((c) == ' ' || (c) == '\t' || (c) == '\n')
         1138 
         1139 int
         1140 isp9font(void)
         1141 {
         1142         uchar *cp, *p;
         1143         int i, n;
         1144         char pathname[1024];
         1145 
         1146         cp = buf;
         1147         if (!getfontnum(cp, &cp))        /* height */
         1148                 return 0;
         1149         if (!getfontnum(cp, &cp))        /* ascent */
         1150                 return 0;
         1151         for (i = 0; 1; i++) {
         1152                 if (!getfontnum(cp, &cp))        /* min */
         1153                         break;
         1154                 if (!getfontnum(cp, &cp))        /* max */
         1155                         return 0;
         1156                 while (WHITESPACE(*cp))
         1157                         cp++;
         1158                 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
         1159                                 ;
         1160                         /* construct a path name, if needed */
         1161                 n = 0;
         1162                 if (*p != '/' && slash) {
         1163                         n = slash-fname+1;
         1164                         if (n < sizeof(pathname))
         1165                                 memcpy(pathname, fname, n);
         1166                         else n = 0;
         1167                 }
         1168                 if (n+cp-p < sizeof(pathname)) {
         1169                         memcpy(pathname+n, p, cp-p);
         1170                         n += cp-p;
         1171                         pathname[n] = 0;
         1172                         if (access(pathname, AEXIST) < 0)
         1173                                 return 0;
         1174                 }
         1175         }
         1176         if (i) {
         1177                 print(mime ? "text/plain\n" : "font file\n");
         1178                 return 1;
         1179         }
         1180         return 0;
         1181 }
         1182 
         1183 int
         1184 getfontnum(uchar *cp, uchar **rp)
         1185 {
         1186         while (WHITESPACE(*cp))                /* extract ulong delimited by whitespace */
         1187                 cp++;
         1188         if (*cp < '0' || *cp > '9')
         1189                 return 0;
         1190         strtoul((char *)cp, (char **)rp, 0);
         1191         if (!WHITESPACE(**rp))
         1192                 return 0;
         1193         return 1;
         1194 }
         1195 
         1196 int
         1197 isrtf(void)
         1198 {
         1199         if(strstr((char *)buf, "\\rtf1")){
         1200                 print(mime ? "application/rtf\n" : "rich text format\n");
         1201                 return 1;
         1202         }
         1203         return 0;
         1204 }
         1205 
         1206 int
         1207 ismsdos(void)
         1208 {
         1209         if (buf[0] == 0x4d && buf[1] == 0x5a){
         1210                 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
         1211                 return 1;
         1212         }
         1213         return 0;
         1214 }
         1215 
         1216 int
         1217 iself(void)
         1218 {
         1219         static char *cpu[] = {                /* NB: incomplete and arbitary list */
         1220                 nil,
         1221         /*1*/        "WE32100",
         1222         /*2*/        "SPARC",
         1223         /*3*/        "i386",
         1224         /*4*/        "M68000",
         1225         /*5*/        "M88000",
         1226         /*6*/        "i486",
         1227         /*7*/        "i860",
         1228         /*8*/        "R3000",
         1229         /*9*/        "S370",
         1230         /*10*/        "R4000",
         1231                 nil, nil, nil, nil,
         1232         /*15*/        "HP-PA",
         1233                 nil,
         1234                 nil,
         1235         /*18*/        "sparc v8+",
         1236         /*19*/        "i960",
         1237         /*20*/        "PPC-32",
         1238         /*21*/        "PPC-64",
         1239                 nil, nil, nil, nil,
         1240                 nil, nil, nil, nil, nil,
         1241                 nil, nil, nil, nil, nil,
         1242                 nil, nil, nil, nil,
         1243         /*40*/        "ARM",
         1244         /*41*/        "Alpha",
         1245                 nil,
         1246         /*43*/        "sparc v9",
         1247                 nil, nil,
         1248                 nil, nil, nil, nil,
         1249         /*50*/        "IA-64",
         1250                 nil, nil, nil, nil, nil,
         1251                 nil, nil, nil, nil, nil,
         1252                 nil,
         1253         /*62*/        "AMD64",
         1254                 nil, nil, nil,
         1255                 nil, nil, nil, nil, nil,
         1256                 nil, nil, nil, nil,
         1257         /*75*/        "VAX",
         1258         };
         1259 
         1260 
         1261         if (memcmp(buf, "\177ELF", 4) == 0){
         1262                 /* gcc misparses \x7FELF as \x7FE L F */
         1263                 if (!mime){
         1264                         int n = (buf[19] << 8) | buf[18];
         1265                         char *p = "unknown";
         1266 
         1267                         if (n > 0 && n < nelem(cpu) && cpu[n])
         1268                                 p = cpu[n];
         1269                         else {
         1270                                 /* try the other byte order */
         1271                                 n = (buf[18] << 8) | buf[19];
         1272                                 if (n > 0 && n < nelem(cpu) && cpu[n])
         1273                                         p = cpu[n];
         1274                         }
         1275                         print("%s ELF executable\n", p);
         1276                 }
         1277                 else
         1278                         print("application/x-elf-executable");
         1279                 return 1;
         1280         }
         1281 
         1282         return 0;
         1283 }