smu.c - smu - smu - simple markup (Markdown) processor (fork, fixes + features)
(HTM) git clone git://git.codemadness.org/smu
(DIR) Log
(DIR) Files
(DIR) Refs
(DIR) README
(DIR) LICENSE
---
smu.c (17096B)
---
1 #include <ctype.h>
2 #include <errno.h>
3 #include <stdarg.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
7
8 #ifdef __OpenBSD__
9 #include <unistd.h>
10 #else
11 #define pledge(p1,p2) 0
12 #endif
13
/* Number of elements in a true array (not a pointer). Fully parenthesized so
 * it can be used inside larger expressions such as `n / LENGTH(a)`. */
#define LENGTH(x) (sizeof(x) / sizeof((x)[0]))
/*
 * Append helper: yields the lvalue b[i], growing b by BUFSIZ bytes whenever i
 * reaches a multiple of BUFSIZ (including i == 0). Exits via eprint() when
 * realloc fails.
 * The expansion is an `if` statement followed by an expression, so it must be
 * used as a complete statement inside a braced block, e.g. `ADDC(buf, i) = c;`,
 * never as the unbraced body of an if/else/loop. The arguments are evaluated
 * more than once, so they must be free of side effects.
 */
#define ADDC(b,i) if ((i) % BUFSIZ == 0) { (b) = realloc((b), ((i) + BUFSIZ)); if (!(b)) eprint("realloc"); } (b)[(i)]
16
/*
 * Common parser signature: (begin, end, newblock).
 * As consumed by process(): a return value of 0 means "no match"; otherwise
 * the absolute value is the number of input bytes consumed, and a negative
 * value additionally makes the following input start a new block.
 */
typedef int (*Parser)(const char *, const char *, int);

/* One markup rule: what to look for and what to print around the content. */
typedef struct {
	char *search;   /* literal marker that is matched in the input */
	int process;    /* 0: content is only HTML-escaped; non-zero: content is
	                 * run through process() (line prefixes treat >= 2 as
	                 * "process as a new block") */
	char *before;   /* text printed before the content */
	char *after;    /* text printed after the content */
} Tag;
23
/*
 * Forward declarations. Every do*() function has the Parser signature and is
 * called by process() with the current position, the end of the range and the
 * "new block" flag.
 */
static int doamp(const char *begin, const char *end, int newblock);        /* Parser for & */
static int docomment(const char *begin, const char *end, int newblock);    /* Parser for html-comments */
static int dogtlt(const char *begin, const char *end, int newblock);       /* Parser for < and > */
static int dohtml(const char *begin, const char *end, int newblock);       /* Parser for html */
static int dolineprefix(const char *begin, const char *end, int newblock); /* Parser for line prefix tags */
static int dolink(const char *begin, const char *end, int newblock);       /* Parser for links and images */
static int dolist(const char *begin, const char *end, int newblock);       /* Parser for lists */
static int doparagraph(const char *begin, const char *end, int newblock);  /* Parser for paragraphs */
static int doreplace(const char *begin, const char *end, int newblock);    /* Parser for simple replaces */
static int doshortlink(const char *begin, const char *end, int newblock);  /* Parser for <url> and <mail@address> short links */
static int dosurround(const char *begin, const char *end, int newblock);   /* Parser for surrounding tags */
static int dounderline(const char *begin, const char *end, int newblock);  /* Parser for underline tags */
static void *ereallocz(void *p, size_t size);                              /* realloc that exits on failure */
static void hprint(const char *begin, const char *end);                    /* Escapes HTML and prints it to output */
static void hprintattr(const char *begin, const char *end);                /* Escapes HTML for attributes and prints it to output */
static void process(const char *begin, const char *end, int newblock);     /* Processes range between begin and end. */
40
41 /* list of parsers */
42 static Parser parsers[] = { dounderline, docomment, dolineprefix,
43 dolist, doparagraph, dogtlt, dosurround, dolink,
44 doshortlink, dohtml, doamp, doreplace };
45 static int lazyimg = 0, nohtml = 0;
46
47 static Tag lineprefix[] = {
48 { " ", 0, "<pre><code>", "\n</code></pre>" },
49 { "\t", 0, "<pre><code>", "\n</code></pre>" },
50 { ">", 2, "<blockquote>", "</blockquote>" },
51 { "###### ", 1, "<h6>", "</h6>" },
52 { "##### ", 1, "<h5>", "</h5>" },
53 { "#### ", 1, "<h4>", "</h4>" },
54 { "### ", 1, "<h3>", "</h3>" },
55 { "## ", 1, "<h2>", "</h2>" },
56 { "# ", 1, "<h1>", "</h1>" },
57 { "- - -\n", 1, "<hr />", ""},
58 { "---\n", 1, "<hr />", ""},
59 };
60
61 static Tag underline[] = {
62 { "=", 1, "<h1>", "</h1>\n" },
63 { "-", 1, "<h2>", "</h2>\n" },
64 };
65
66 static Tag surround[] = {
67 { "``", 0, "<code>", "</code>" },
68 { "`", 0, "<code>", "</code>" },
69 { "___", 1, "<strong><em>", "</em></strong>" },
70 { "***", 1, "<strong><em>", "</em></strong>" },
71 { "__", 1, "<strong>", "</strong>" },
72 { "**", 1, "<strong>", "</strong>" },
73 { "_", 1, "<em>", "</em>" },
74 { "*", 1, "<em>", "</em>" },
75 };
76
/* Backslash escapes: { input sequence, text printed instead }. */
static const char *replace[][2] = {
	{ "\\\\", "\\" },
	{ "\\`",  "`"  },
	{ "\\*",  "*"  },
	{ "\\_",  "_"  },
	{ "\\{",  "{"  },
	{ "\\}",  "}"  },
	{ "\\[",  "["  },
	{ "\\]",  "]"  },
	{ "\\(",  "("  },
	{ "\\)",  ")"  },
	{ "\\#",  "#"  },
	{ "\\+",  "+"  },
	{ "\\-",  "-"  },
	{ "\\.",  "."  },
	{ "\\!",  "!"  },
};
94
/*
 * Insertions: { input sequence, text printed in front of it }. The matched
 * input is not consumed by doreplace().
 * A hard line break is two trailing spaces followed by a newline (Markdown
 * convention); a single trailing space must not produce <br />.
 * NOTE(review): the two-space form is restored on the assumption that runs of
 * spaces were collapsed in this listing -- confirm against the repository.
 */
static const char *insert[][2] = {
	{ "  \n", "<br />" },
};
98
/*
 * Report a fatal error: formats the message printf-style onto stderr and
 * terminates the program with exit status 1. Never returns.
 */
void
eprint(const char *format, ...)
{
	va_list args;

	va_start(args, format);
	vfprintf(stderr, format, args);
	va_end(args);

	exit(1);
}
109
110 int
111 doamp(const char *begin, const char *end, int newblock)
112 {
113 const char *p;
114
115 if (*begin != '&')
116 return 0;
117 if (!nohtml) {
118 for (p = begin + 1; p != end && !strchr("; \\\n\t", *p); p++)
119 ;
120 if (p == end || *p == ';')
121 return 0;
122 }
123 fputs("&", stdout);
124 return 1;
125 }
126
127 int
128 dogtlt(const char *begin, const char *end, int newblock)
129 {
130 int brpos;
131 char c;
132
133 if (nohtml || begin + 1 >= end)
134 return 0;
135 brpos = begin[1] == '>';
136 if (!brpos && *begin != '<')
137 return 0;
138 c = begin[brpos ? 0 : 1];
139 if (!brpos && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z')) {
140 fputs("<", stdout);
141 return 1;
142 } else if (brpos && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') && !strchr("/\"'", c)) {
143 fprintf(stdout, "%c>",c);
144 return 2;
145 }
146 return 0;
147 }
148
149 int
150 docomment(const char *begin, const char *end, int newblock)
151 {
152 char *p;
153
154 if (nohtml || strncmp("<!--", begin, 4))
155 return 0;
156 p = strstr(begin, "-->");
157 if (!p || p + 3 >= end)
158 return 0;
159 fprintf(stdout, "%.*s\n", (int)(p + 3 - begin), begin);
160 return (p + 3 - begin) * (newblock ? -1 : 1);
161 }
162
163 int
164 dohtml(const char *begin, const char *end, int newblock)
165 {
166 const char *p, *tag, *tagend;
167
168 if (nohtml || begin + 2 >= end)
169 return 0;
170 p = begin;
171 if (p[0] != '<' || !isalpha((unsigned char)p[1]))
172 return 0;
173 p++;
174 tag = p;
175 for (; isalnum((unsigned char)*p) && p < end; p++)
176 ;
177 tagend = p;
178 if (p > end || tag == tagend)
179 return 0;
180 while ((p = strstr(p, "</")) && p < end) {
181 p += 2;
182 if (strncmp(p, tag, tagend - tag) == 0 && p[tagend - tag] == '>') {
183 p++;
184 fwrite(begin, 1, p - begin + tagend - tag + 1, stdout);
185 return p - begin + tagend - tag + 1;
186 }
187 }
188 if ((p = strchr(tagend, '>'))) {
189 fwrite(begin, 1, p - begin + 2, stdout);
190 return p - begin + 2;
191 } else {
192 return 0;
193 }
194 }
195
196 int
197 dolineprefix(const char *begin, const char *end, int newblock)
198 {
199 unsigned int i, j, l;
200 char *buffer;
201 const char *p;
202
203 if (newblock)
204 p = begin;
205 else if (*begin == '\n')
206 p = begin + 1;
207 else
208 return 0;
209 for (i = 0; i < LENGTH(lineprefix); i++) {
210 l = strlen(lineprefix[i].search);
211 if (end - p < l)
212 continue;
213 if (strncmp(lineprefix[i].search, p, l))
214 continue;
215 if (*begin == '\n')
216 putc('\n', stdout);
217 fputs(lineprefix[i].before, stdout);
218 if (lineprefix[i].search[l-1] == '\n') {
219 putc('\n', stdout);
220 return l - 1;
221 }
222 if (!(buffer = malloc(BUFSIZ)))
223 eprint("malloc");
224 buffer[0] = '\0';
225
226 /* Collect lines into buffer while they start with the prefix */
227 j = 0;
228 while ((strncmp(lineprefix[i].search, p, l) == 0) && p + l < end) {
229 p += l;
230
231 /* Special case for blockquotes: optional space after > */
232 if (lineprefix[i].search[0] == '>' && *p == ' ') {
233 p++;
234 }
235
236 while (p < end) {
237 ADDC(buffer, j) = *p;
238 j++;
239 if (*(p++) == '\n')
240 break;
241 }
242 }
243
244 /* Skip empty lines in block */
245 while (*(buffer + j - 1) == '\n')
246 j--;
247
248 ADDC(buffer, j) = '\0';
249 if (lineprefix[i].process)
250 process(buffer, buffer + strlen(buffer), lineprefix[i].process >= 2);
251 else
252 hprint(buffer, buffer + strlen(buffer));
253 puts(lineprefix[i].after);
254 free(buffer);
255 return -(p - begin);
256 }
257 return 0;
258 }
259
260 int
261 dolink(const char *begin, const char *end, int newblock)
262 {
263 long width = 0, height = 0;
264 int img, len, parens_depth = 1;
265 char *numend;
266 const char *desc, *link, *p, *q, *descend, *linkend;
267 const char *title = NULL, *titleend = NULL;
268
269 if (*begin == '[')
270 img = 0;
271 else if (strncmp(begin, ") || p > end)
277 return 0;
278 for (q = strstr(desc, ") || p > end)
280 return 0;
281 descend = p;
282 link = p + 2;
283
284 /* find end of link while handling nested parens */
285 q = link;
286 while (parens_depth) {
287 if (!(q = strpbrk(q, "()")) || q > end)
288 return 0;
289 if (*q == '(')
290 parens_depth++;
291 else
292 parens_depth--;
293 if (parens_depth && q < end)
294 q++;
295 }
296
297 linkend = q;
298 if (*link == '<' && *(linkend - 1) == '>') {
299 link++;
300 linkend--;
301 } else {
302 /* trim leading spaces */
303 for (p = link; p < q && isspace((unsigned char)*p); p++)
304 ;
305
306 for (link = p; p < q; p++) {
307 if (*p == '=' && img && p != link &&
308 isspace((unsigned char)p[-1])) {
309 /* image dimensions */
310 linkend = p;
311 width = strtol(++p, &numend, 10);
312 p = numend;
313 if (*numend == 'x')
314 height = strtol(++p, &numend, 10);
315 } else if ((*p == '\'' || *p == '"') && p != link &&
316 isspace((unsigned char)p[-1])) {
317 /* title attribute: for links and images */
318 linkend = p;
319 title = ++p;
320 if ((titleend = strchr(title, *(p - 1)))) {
321 if (titleend >= q)
322 titleend = q;
323 else
324 p = titleend;
325 }
326 }
327 }
328
329 /* trim trailing spaces from link */
330 for (; linkend > link && isspace((unsigned char)linkend[-1]); linkend--)
331 ;
332 }
333
334 len = q + 1 - begin;
335 if (img) {
336 fputs("<img src=\"", stdout);
337 hprintattr(link, linkend);
338 fputs("\" alt=\"", stdout);
339 hprintattr(desc, descend);
340 fputs("\" ", stdout);
341 if (title && titleend && title != titleend) {
342 fputs("title=\"", stdout);
343 hprintattr(title, titleend);
344 fputs("\" ", stdout);
345 }
346 if (width > 0)
347 printf("width=\"%ld\" ", width);
348 if (height > 0)
349 printf("height=\"%ld\" ", height);
350 if (width > 0 && height > 0 && lazyimg)
351 fputs("loading=\"lazy\" ", stdout);
352 fputs("/>", stdout);
353 } else {
354 fputs("<a href=\"", stdout);
355 hprintattr(link, linkend);
356 fputs("\"", stdout);
357 if (title && titleend && title != titleend) {
358 fputs(" title=\"", stdout);
359 hprintattr(title, titleend);
360 fputs("\"", stdout);
361 }
362 fputs(">", stdout);
363 process(desc, descend, 0);
364 fputs("</a>", stdout);
365 }
366 return len;
367 }
368
369 int
370 dolist(const char *begin, const char *end, int newblock)
371 {
372 unsigned int i, j, indent, run, ul, isblock;
373 const char *p, *q;
374 char *buffer = NULL;
375 char marker;
376
377 isblock = 0;
378 if (newblock)
379 p = begin;
380 else if (*begin == '\n')
381 p = begin + 1;
382 else
383 return 0;
384 q = p;
385 if (*p == '-' || *p == '*' || *p == '+') {
386 ul = 1;
387 marker = *p;
388 } else {
389 ul = 0;
390 for (; p < end && *p >= '0' && *p <= '9'; p++)
391 ;
392 if (p >= end || *p != '.')
393 return 0;
394 }
395 p++;
396 if (p >= end || !(*p == ' ' || *p == '\t'))
397 return 0;
398 for (p++; p != end && (*p == ' ' || *p == '\t'); p++)
399 ;
400 indent = p - q;
401 buffer = ereallocz(buffer, BUFSIZ);
402 if (!newblock)
403 putc('\n', stdout);
404 fputs(ul ? "<ul>\n" : "<ol>\n", stdout);
405 run = 1;
406 for (; p < end && run; p++) {
407 for (i = 0; p < end && run; p++, i++) {
408 if (*p == '\n') {
409 if (p + 1 == end) {
410 break;
411 } else {
412 /* Handle empty lines */
413 for (q = p + 1; (*q == ' ' || *q == '\t') && q < end; q++)
414 ;
415 if (*q == '\n') {
416 ADDC(buffer, i) = '\n';
417 i++;
418 run = 0;
419 isblock++;
420 p = q;
421 }
422 }
423 q = p + 1;
424 j = 0;
425 if (ul && *q == marker) {
426 j = 1;
427 } else if (!ul) {
428 for (; q + j != end && q[j] >= '0' && q[j] <= '9' && j < indent; j++)
429 ;
430 if (q + j == end)
431 break;
432 if (j > 0 && q[j] == '.')
433 j++;
434 else
435 j = 0;
436 }
437 if (q + indent < end)
438 for (; (q[j] == ' ' || q[j] == '\t') && j < indent; j++)
439 ;
440 if (j == indent) {
441 ADDC(buffer, i) = '\n';
442 i++;
443 p += indent;
444 run = 1;
445 if (*q == ' ' || *q == '\t')
446 p++;
447 else
448 break;
449 } else if (j < indent) {
450 run = 0;
451 }
452 }
453 ADDC(buffer, i) = *p;
454 }
455 ADDC(buffer, i) = '\0';
456 fputs("<li>", stdout);
457 process(buffer, buffer + i, isblock > 1 || (isblock == 1 && run));
458 fputs("</li>\n", stdout);
459 }
460 fputs(ul ? "</ul>\n" : "</ol>\n", stdout);
461 free(buffer);
462 p--;
463 while (*(--p) == '\n')
464 ;
465
466 return -(p - begin + 1);
467 }
468
/*
 * Parser for paragraphs. Only applies at the start of a block.
 * The paragraph runs up to the next empty line ("\n\n") or to `end`,
 * whichever comes first; paragraphs of at most one byte are ignored.
 * The content is processed inline and wrapped in <p>...</p>.
 * Returns the negated number of bytes consumed, or 0 when nothing was done.
 */
int
doparagraph(const char *begin, const char *end, int newblock)
{
	const char *stop;

	if (!newblock)
		return 0;

	stop = strstr(begin, "\n\n");
	if (stop == NULL || stop > end)
		stop = end;
	if (stop - begin <= 1)
		return 0;

	fputs("<p>", stdout);
	process(begin, stop, 0);
	fputs("</p>\n", stdout);

	return -(stop - begin);
}
487
488 int
489 doreplace(const char *begin, const char *end, int newblock)
490 {
491 unsigned int i, l;
492
493 for (i = 0; i < LENGTH(insert); i++)
494 if (strncmp(insert[i][0], begin, strlen(insert[i][0])) == 0)
495 fputs(insert[i][1], stdout);
496 for (i = 0; i < LENGTH(replace); i++) {
497 l = strlen(replace[i][0]);
498 if (end - begin < l)
499 continue;
500 if (strncmp(replace[i][0], begin, l) == 0) {
501 fputs(replace[i][1], stdout);
502 return l;
503 }
504 }
505 return 0;
506 }
507
/*
 * Parser for short links: "<scheme:...>" and "<user@host>".
 * The text between the angle brackets must not contain a space, tab or
 * newline. A ':' or '#' marks it as a plain link; otherwise an '@' marks it
 * as a mail address, which is emitted as a "mailto:" link with every byte
 * written as a numeric character reference. Text with neither is not a
 * short link.
 * Returns the number of bytes consumed, or 0 when there is no match.
 */
int
doshortlink(const char *begin, const char *end, int newblock)
{
	const char *p, *c;
	int ismail = 0; /* 0: undecided, 1: mail address, -1: other link */

	if (*begin != '<')
		return 0;
	for (p = begin + 1; p != end; p++) {
		switch(*p) {
		case ' ':
		case '\t':
		case '\n':
			return 0;
		case '#':
		case ':':
			ismail = -1;
			break;
		case '@':
			if (ismail == 0)
				ismail = 1;
			break;
		case '>':
			if (ismail == 0)
				return 0;
			fputs("<a href=\"", stdout);
			if (ismail == 1) {
				/* mailto: */
				fputs("mailto:", stdout);
				/* go through unsigned char so bytes >= 0x80 are not
				 * sign-extended into huge numbers by %u */
				for (c = begin + 1; c != p; c++)
					fprintf(stdout, "&#%u;", (unsigned int)(unsigned char)*c);
				fputs("\">", stdout);
				for (c = begin + 1; c != p; c++)
					fprintf(stdout, "&#%u;", (unsigned int)(unsigned char)*c);
			} else {
				hprintattr(begin + 1, p);
				fputs("\">", stdout);
				hprint(begin + 1, p);
			}
			fputs("</a>", stdout);
			return p - begin + 1;
		default:
			break;
		}
	}
	return 0;
}
553
554 int
555 dosurround(const char *begin, const char *end, int newblock)
556 {
557 unsigned int i, l;
558 const char *p, *start, *stop;
559
560 for (i = 0; i < LENGTH(surround); i++) {
561 l = strlen(surround[i].search);
562 if (end - begin < 2*l || strncmp(begin, surround[i].search, l) != 0)
563 continue;
564 start = begin + l;
565 p = start - 1;
566 do {
567 stop = p;
568 p = strstr(p + 1, surround[i].search);
569 } while (p && p[-1] == '\\');
570 if (p && p[-1] != '\\')
571 stop = p;
572 if (!stop || stop < start || stop >= end)
573 continue;
574 fputs(surround[i].before, stdout);
575
576 /* Single space at start and end are ignored */
577 if (*start == ' ' && *(stop - 1) == ' ') {
578 start++;
579 stop--;
580 l++;
581 }
582
583 if (surround[i].process)
584 process(start, stop, 0);
585 else
586 hprint(start, stop);
587 fputs(surround[i].after, stdout);
588 return stop - begin + l;
589 }
590 return 0;
591 }
592
593 int
594 dounderline(const char *begin, const char *end, int newblock)
595 {
596 unsigned int i, j, l;
597 const char *p;
598
599 if (!newblock)
600 return 0;
601 p = begin;
602 for (l = 0; p + l != end && p[l] != '\n'; l++)
603 ;
604 p += l + 1;
605 if (l == 0)
606 return 0;
607 for (i = 0; i < LENGTH(underline); i++) {
608 for (j = 0; p + j != end && p[j] != '\n' && p[j] == underline[i].search[0]; j++)
609 ;
610 if (j == l || (p[j] == '\n' && j > 3)) {
611 fputs(underline[i].before, stdout);
612 if (underline[i].process)
613 process(begin, begin + l, 0);
614 else
615 hprint(begin, begin + l);
616 fputs(underline[i].after, stdout);
617 return -(j + p - begin);
618 }
619 }
620 return 0;
621 }
622
/*
 * realloc() wrapper: resizes p to `size` bytes and returns the new pointer.
 * If realloc() returns NULL the program is terminated through eprint().
 */
void *
ereallocz(void *p, size_t size)
{
	void *grown = realloc(p, size);

	if (grown == NULL)
		eprint("realloc: %zu bytes\n", size);
	return grown;
}
633
/*
 * Write the range [begin, end) to stdout, escaped for use inside a
 * double-quoted HTML attribute: & " > < become &amp; &quot; &gt; &lt;.
 * All other bytes are copied unchanged.
 */
void
hprintattr(const char *begin, const char *end)
{
	const char *p;

	for (p = begin; p != end; p++) {
		switch (*p) {
		case '&':
			fputs("&amp;", stdout);
			break;
		case '"':
			fputs("&quot;", stdout);
			break;
		case '>':
			fputs("&gt;", stdout);
			break;
		case '<':
			fputs("&lt;", stdout);
			break;
		default:
			putc(*p, stdout);
			break;
		}
	}
}
652
/*
 * Write the range [begin, end) to stdout, escaped for use as HTML text:
 * & > < become &amp; &gt; &lt;. All other bytes are copied unchanged.
 */
void
hprint(const char *begin, const char *end)
{
	const char *p;

	for (p = begin; p != end; p++) {
		switch (*p) {
		case '&':
			fputs("&amp;", stdout);
			break;
		case '>':
			fputs("&gt;", stdout);
			break;
		case '<':
			fputs("&lt;", stdout);
			break;
		default:
			putc(*p, stdout);
			break;
		}
	}
}
669
670 void
671 process(const char *begin, const char *end, int newblock)
672 {
673 const char *p, *q;
674 int affected;
675 unsigned int i;
676
677 for (p = begin; p < end;) {
678 if (newblock)
679 while (*p == '\n')
680 if (++p == end)
681 return;
682 affected = 0;
683 for (i = 0; i < LENGTH(parsers) && !affected; i++)
684 affected = parsers[i](p, end, newblock);
685 p += abs(affected);
686 if (!affected) {
687 if (nohtml)
688 hprint(p, p + 1);
689 else
690 putc(*p, stdout);
691 p++;
692 }
693 for (q = p; q != end && *q == '\n'; q++)
694 ;
695 if (q == end)
696 return;
697 else if (p[0] == '\n' && p + 1 != end && p[1] == '\n')
698 newblock = 1;
699 else
700 newblock = affected < 0;
701 }
702 }
703
/* Print the command line synopsis through eprint(), which exits. */
void
usage(char **argv)
{
	const char *prog = argv[0];

	eprint("usage: %s [-l] [-n] [file]\n", prog);
}
709
710 int
711 main(int argc, char *argv[])
712 {
713 FILE *source = stdin;
714 char *buffer = NULL;
715 int s, i;
716 unsigned long len, bsize;
717
718 for (i = 1; i < argc; i++) {
719 if (!strcmp("-v", argv[i])) {
720 eprint("smu v%s\n", VERSION);
721 } else if (!strcmp("-n", argv[i])) {
722 nohtml = 1;
723 } else if (!strcmp("-l", argv[i])) {
724 lazyimg = 1;
725 } else if (argv[i][0] != '-') {
726 break; /* file specified */
727 } else if (!strcmp("--", argv[i])) {
728 i++;
729 break;
730 } else {
731 usage(argv);
732 }
733 }
734 if (i < argc && !(source = fopen(argv[i], "r")))
735 eprint("fopen: %s: %s\n", argv[i], strerror(errno));
736
737 if (pledge("stdio", NULL) == -1)
738 eprint("pledge");
739
740 bsize = 2 * BUFSIZ;
741 buffer = ereallocz(buffer, bsize);
742 len = 0;
743 while ((s = fread(buffer + len, 1, BUFSIZ, source))) {
744 len += s;
745 if (BUFSIZ + len + 1 > bsize) {
746 bsize += BUFSIZ;
747 if (!(buffer = realloc(buffer, bsize)))
748 eprint("realloc");
749 }
750 }
751 buffer[len] = '\0';
752 process(buffer, buffer + len, 1);
753 free(buffer);
754 if (source != stdin)
755 fclose(source);
756
757 return 0;
758 }