/* Raw File */
/*============================================================================
epub2txt v2
xhtml.c
Copyright (c)2020 Kevin Boone, GPL v3.0
============================================================================*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/stat.h>
#ifndef __APPLE__
#include <malloc.h>
#endif
#include "epub2txt.h"
#include "log.h"
#include "custom_string.h"
#include "wstring.h"
#include "wrap.h"
#include "xhtml.h"
/*============================================================================
  Format definition stuff

  Format names a text-style transition. Every style comes as an _ON/_OFF
  pair; FORMAT_NONE stands for "no particular style". The tag classifiers
  in this file hand a Format back to the parser, which passes it on to
  xhtml_emit_format() and xhtml_set_format().
============================================================================*/
typedef enum
  {
  FORMAT_NONE = 0,     /* no style change */
  FORMAT_BOLD_ON,      /* bold span starts */
  FORMAT_BOLD_OFF,     /* bold span ends */
  FORMAT_ITALIC_ON,    /* italic span starts */
  FORMAT_ITALIC_OFF,   /* italic span ends */
  FORMAT_H1_ON,        /* heading levels 1..5; the emit/set functions */
  FORMAT_H1_OFF,       /* in this file treat all of them as bold */
  FORMAT_H2_ON,
  FORMAT_H2_OFF,
  FORMAT_H3_ON,
  FORMAT_H3_OFF,
  FORMAT_H4_ON,
  FORMAT_H4_OFF,
  FORMAT_H5_ON,
  FORMAT_H5_OFF
  } Format;
/* Bit flags recording which ANSI highlights are currently active. They are
   stored in the wrap context (wraptext_context_set_fmt / _reset_fmt) and
   tested in xhtml_emit_fmt_eol_post(). */
enum
  {
  FMT_BOLD = 0x01,
  FMT_ITAL = 0x02
  };
/*============================================================================
  xhtml_is_start_format_tag

  Reports whether tag (tag name without angle brackets, compared
  case-insensitively) opens an inline style: "b" or "i". On a match,
  *format receives the corresponding _ON value and TRUE is returned.
  Otherwise FALSE is returned and *format is left untouched.
============================================================================*/
BOOL xhtml_is_start_format_tag (const char *tag, Format *format)
  {
  BOOL matched = TRUE;

  if (strcasecmp (tag, "b") == 0)
    *format = FORMAT_BOLD_ON;
  else if (strcasecmp (tag, "i") == 0)
    *format = FORMAT_ITALIC_ON;
  else
    matched = FALSE;

  return matched;
  }
/*============================================================================
  xhtml_is_end_breaking_tag

  Reports whether tag is the closing tag of a block-level element that
  this converter follows with a paragraph break: /h1../h5, /div and
  /blockquote (case-insensitive). On a match, *format is set to the style
  transition to apply (bold off for headings, FORMAT_NONE otherwise) and
  TRUE is returned. On no match, FALSE is returned and *format is
  unchanged.
============================================================================*/
BOOL xhtml_is_end_breaking_tag (const char *tag, Format *format)
  {
  static const struct
    {
    const char *name;
    Format transition;
    } closers[] =
    {
      { "/h1",         FORMAT_BOLD_OFF },
      { "/h2",         FORMAT_BOLD_OFF },
      { "/h3",         FORMAT_BOLD_OFF },
      { "/h4",         FORMAT_BOLD_OFF },
      { "/h5",         FORMAT_BOLD_OFF },
      { "/div",        FORMAT_NONE },
      { "/blockquote", FORMAT_NONE }
    };
  size_t k;

  for (k = 0; k < sizeof (closers) / sizeof (closers[0]); k++)
    {
    if (strcasecmp (tag, closers[k].name) == 0)
      {
      *format = closers[k].transition;
      return TRUE;
      }
    }
  return FALSE;
  }
/*============================================================================
  xhtml_is_end_format_tag

  Counterpart of xhtml_is_start_format_tag: recognises "/b" and "/i"
  (case-insensitive). On a match, *format receives the corresponding _OFF
  value and TRUE is returned; otherwise FALSE is returned and *format is
  not written.
============================================================================*/
BOOL xhtml_is_end_format_tag (const char *tag, Format *format)
  {
  const int closes_bold = (strcasecmp (tag, "/b") == 0);
  const int closes_italic = !closes_bold && (strcasecmp (tag, "/i") == 0);

  if (closes_bold)
    *format = FORMAT_BOLD_OFF;
  if (closes_italic)
    *format = FORMAT_ITALIC_OFF;

  return (closes_bold || closes_italic) ? TRUE : FALSE;
  }
/*============================================================================
  xhtml_is_start_breaking_tag

  Reports whether tag opens a block-level element that starts a new run of
  text: h1..h5, div and blockquote (case-insensitive). On a match, *format
  is set to the style transition to apply (bold on for headings,
  FORMAT_NONE otherwise) and TRUE is returned. On no match, FALSE is
  returned and *format is unchanged.
============================================================================*/
BOOL xhtml_is_start_breaking_tag (const char *tag, Format *format)
  {
  static const struct
    {
    const char *name;
    Format transition;
    } openers[] =
    {
      { "h1",         FORMAT_BOLD_ON },
      { "h2",         FORMAT_BOLD_ON },
      { "h3",         FORMAT_BOLD_ON },
      { "h4",         FORMAT_BOLD_ON },
      { "h5",         FORMAT_BOLD_ON },
      { "div",        FORMAT_NONE },
      { "blockquote", FORMAT_NONE }
    };
  size_t k;

  for (k = 0; k < sizeof (openers) / sizeof (openers[0]); k++)
    {
    if (strcasecmp (tag, openers[k].name) == 0)
      {
      *format = openers[k].transition;
      return TRUE;
      }
    }
  return FALSE;
  }
/*============================================================================
  xhtml_emit_format

  Writes the ANSI escape sequence for a style transition to stdout. Nothing
  is written unless options->ansi is set and options->raw is clear.

  All _OFF transitions emit the same "reset all attributes" sequence
  (ESC[0m), so switching one style off also clears any other style on the
  terminal. FORMAT_NONE emits nothing.
============================================================================*/
void xhtml_emit_format (const Epub2TxtOptions *options, Format format)
  {
  IN
  if (options->ansi && !options->raw)
    {
    const char *sequence = NULL;

    switch (format)
      {
      case FORMAT_BOLD_ON:
      case FORMAT_H1_ON:
      case FORMAT_H2_ON:
      case FORMAT_H3_ON:
      case FORMAT_H4_ON:
      case FORMAT_H5_ON:
        sequence = "\x1B[1m";
        break;

      case FORMAT_ITALIC_ON:
        sequence = "\x1B[3m";
        break;

      case FORMAT_BOLD_OFF:
      case FORMAT_ITALIC_OFF:
      case FORMAT_H1_OFF:
      case FORMAT_H2_OFF:
      case FORMAT_H3_OFF:
      case FORMAT_H4_OFF:
      case FORMAT_H5_OFF:
        sequence = "\x1B[0m";
        break;

      case FORMAT_NONE:
        break;
      }

    if (sequence != NULL)
      fputs (sequence, stdout);
    }
  OUT
  }
/*============================================================================
  xhtml_emit_fmt_eol_pre

  Emits an ANSI reset when any highlight flag is recorded in the wrap
  context and ANSI output is enabled (options->ansi set, options->raw
  clear). The options are retrieved from the wrap context's app-opts slot.
  Presumably invoked by the wrapper just before it ends an output line --
  the call site is not in this file.
============================================================================*/
void xhtml_emit_fmt_eol_pre (WrapTextContext *context)
  {
  IN
  unsigned int active = wraptext_context_get_fmt (context);
  const Epub2TxtOptions *options =
      (Epub2TxtOptions *) wraptext_context_get_app_opts (context);

  if (active != 0 && options->ansi && !options->raw)
    {
    /* FORMAT_BOLD_OFF is emitted as "reset all", so it clears every
       active highlight, not only bold. */
    xhtml_emit_format (options, FORMAT_BOLD_OFF);
    }
  OUT
  }
/*============================================================================
  xhtml_emit_fmt_eol_post

  Re-emits the ANSI "on" sequence for each highlight flag recorded in the
  wrap context, provided ANSI output is enabled (options->ansi set,
  options->raw clear). Presumably invoked by the wrapper at the start of
  the next output line, undoing the reset made by xhtml_emit_fmt_eol_pre --
  the call site is not in this file.
============================================================================*/
void xhtml_emit_fmt_eol_post (WrapTextContext *context)
  {
  IN
  unsigned int active = wraptext_context_get_fmt (context);
  const Epub2TxtOptions *options =
      (Epub2TxtOptions *) wraptext_context_get_app_opts (context);

  if (options->ansi && !options->raw)
    {
    /* A zero flag word matches neither test, so nothing is emitted. */
    if (active & FMT_BOLD)
      xhtml_emit_format (options, FORMAT_BOLD_ON);
    if (active & FMT_ITAL)
      xhtml_emit_format (options, FORMAT_ITALIC_ON);
    }
  OUT
  }
/*============================================================================
  xhtml_set_format

  Records a style transition in the wrap context's highlight flags, so the
  end-of-line hooks know what is active. Only acts when ANSI output is
  enabled (options->ansi set, options->raw clear).

  Headings are tracked as bold. FORMAT_NONE clears every flag.
============================================================================*/
void xhtml_set_format (const Epub2TxtOptions *options, Format format,
    WrapTextContext *context)
  {
  IN
  if (options->ansi && !options->raw)
    {
    switch (format)
      {
      case FORMAT_NONE:
        wraptext_context_zero_fmt (context);
        break;

      case FORMAT_BOLD_ON:
      case FORMAT_H1_ON:
      case FORMAT_H2_ON:
      case FORMAT_H3_ON:
      case FORMAT_H4_ON:
      case FORMAT_H5_ON:
        wraptext_context_set_fmt (context, FMT_BOLD);
        break;

      case FORMAT_BOLD_OFF:
      case FORMAT_H1_OFF:
      case FORMAT_H2_OFF:
      case FORMAT_H3_OFF:
      case FORMAT_H4_OFF:
      case FORMAT_H5_OFF:
        wraptext_context_reset_fmt (context, FMT_BOLD);
        break;

      case FORMAT_ITALIC_ON:
        wraptext_context_set_fmt (context, FMT_ITAL);
        break;

      case FORMAT_ITALIC_OFF:
        wraptext_context_reset_fmt (context, FMT_ITAL);
        break;
      }
    }
  OUT
  }
/*============================================================================
  xhtml_ascii_substitute

  Returns the ASCII replacement text for code point c, or NULL when no
  replacement is defined. Private helper for xhtml_transform_char.
============================================================================*/
static const char *xhtml_ascii_substitute (uint32_t c)
  {
  static const struct
    {
    uint32_t code;
    const char *ascii;
    } table[] =
    {
    /* Punctuation and symbols */
    {0x00A0, " "},   /* no-break space */
    {0x00A9, "(c)"}, /* copyright sign */
    {0x00B4, "'"},   /* acute accent */
    {0x00B5, "u"},   /* micro sign */
    {0x0304, "-"},   /* combining macron */
    {0x2010, "-"},   /* hyphen */
    {0x2013, "-"},   /* en dash */
    {0x2014, "-"},   /* em dash */
    {0x2018, "'"},   /* left single quote */
    {0x2019, "'"},   /* right single quote */
    {0x201C, "\""},  /* left double quote */
    {0x201D, "\""},  /* right double quote */
    {0x2022, "."},   /* bullet */
    {0x2026, "..."}, /* horizontal ellipsis */
    /* Legacy entries: these values are the UTF-8 byte pairs C2 A0
       (no-break space) and C2 A9 (copyright sign) read as one number. */
    {0xC2A0, " "},
    {0xC2A9, "(c)"},

    /* Latin-1 Supplement, U+00C0..U+00FF */
    {0x00C0, "A"}, {0x00C1, "A"}, {0x00C2, "A"}, {0x00C3, "A"},
    {0x00C4, "A"}, {0x00C5, "A"}, {0x00C6, "AE"}, {0x00C7, "C"},
    {0x00C8, "E"}, {0x00C9, "E"}, {0x00CA, "E"}, {0x00CB, "E"},
    {0x00CC, "I"}, {0x00CD, "I"}, {0x00CE, "I"}, {0x00CF, "I"},
    {0x00D0, "D"}, {0x00D1, "N"}, {0x00D2, "O"}, {0x00D3, "O"},
    {0x00D4, "O"}, {0x00D5, "O"}, {0x00D6, "O"}, {0x00D7, "x"},
    {0x00D8, "O"}, {0x00D9, "U"}, {0x00DA, "U"}, {0x00DB, "U"},
    {0x00DC, "U"}, {0x00DD, "Y"}, {0x00DE, "Y"}, {0x00DF, "sz"},
    {0x00E0, "a"}, {0x00E1, "a"}, {0x00E2, "a"}, {0x00E3, "a"},
    {0x00E4, "a"}, {0x00E5, "a"}, {0x00E6, "ae"}, {0x00E7, "c"},
    {0x00E8, "e"}, {0x00E9, "e"}, {0x00EA, "e"}, {0x00EB, "e"},
    {0x00EC, "i"}, {0x00ED, "i"}, {0x00EE, "i"}, {0x00EF, "i"},
    {0x00F0, "o"}, {0x00F1, "n"}, {0x00F2, "o"}, {0x00F3, "o"},
    {0x00F4, "o"}, {0x00F5, "o"}, {0x00F6, "o"}, {0x00F7, "/"},
    {0x00F8, "o"}, {0x00F9, "u"}, {0x00FA, "u"}, {0x00FB, "u"},
    {0x00FC, "u"}, {0x00FD, "y"}, {0x00FE, "y"}, {0x00FF, "y"},

    /* Latin Extended-A, U+0100..U+0178 */
    {0x0100, "A"}, {0x0101, "a"}, {0x0102, "A"}, {0x0103, "a"},
    {0x0104, "A"}, {0x0105, "a"}, {0x0106, "C"}, {0x0107, "c"},
    {0x0108, "C"}, {0x0109, "c"}, {0x010A, "C"}, {0x010B, "c"},
    {0x010C, "C"}, {0x010D, "c"}, {0x010E, "D"}, {0x010F, "d"},
    {0x0110, "D"}, {0x0111, "d"}, {0x0112, "E"}, {0x0113, "e"},
    {0x0114, "E"}, {0x0115, "e"}, {0x0116, "E"}, {0x0117, "e"},
    {0x0118, "E"}, {0x0119, "e"}, {0x011A, "E"}, {0x011B, "e"},
    {0x011C, "G"}, {0x011D, "g"}, {0x011E, "G"}, {0x011F, "g"},
    {0x0120, "G"}, {0x0121, "g"}, {0x0122, "G"}, {0x0123, "g"},
    {0x0124, "H"}, {0x0125, "h"}, {0x0126, "H"}, {0x0127, "h"},
    {0x0128, "I"}, {0x0129, "i"}, {0x012A, "I"}, {0x012B, "i"},
    {0x012C, "I"}, {0x012D, "i"}, {0x012E, "I"}, {0x012F, "i"},
    {0x0130, "I"}, {0x0131, "i"}, {0x0132, "IJ"}, {0x0133, "ij"},
    {0x0134, "J"}, {0x0135, "j"}, {0x0136, "K"}, {0x0137, "k"},
    {0x0138, "k"}, /* small letter kra */
    {0x0139, "L"}, {0x013A, "l"}, {0x013B, "L"}, {0x013C, "l"},
    {0x013D, "L"}, {0x013E, "l"}, {0x013F, "L"}, {0x0140, "l"},
    {0x0141, "L"}, {0x0142, "l"}, {0x0143, "N"}, {0x0144, "n"},
    {0x0145, "N"}, {0x0146, "n"}, {0x0147, "N"}, {0x0148, "n"},
    {0x0149, "n"}, /* small n preceded by apostrophe */
    {0x014A, "N"}, /* capital eng */
    {0x014B, "n"}, /* small eng */
    {0x014C, "O"}, {0x014D, "o"}, {0x014E, "O"}, {0x014F, "o"},
    {0x0150, "O"}, {0x0151, "o"}, {0x0152, "OE"}, {0x0153, "oe"},
    {0x0154, "R"}, {0x0155, "r"}, {0x0156, "R"}, {0x0157, "r"},
    {0x0158, "R"}, {0x0159, "r"}, {0x015A, "S"}, {0x015B, "s"},
    {0x015C, "S"}, {0x015D, "s"}, {0x015E, "S"}, {0x015F, "s"},
    {0x0160, "S"}, {0x0161, "s"}, {0x0162, "T"}, {0x0163, "t"},
    {0x0164, "T"}, {0x0165, "t"}, {0x0166, "T"}, {0x0167, "t"},
    {0x0168, "U"}, {0x0169, "u"}, {0x016A, "U"}, {0x016B, "u"},
    {0x016C, "U"}, {0x016D, "u"}, {0x016E, "U"}, {0x016F, "u"},
    {0x0170, "U"}, {0x0171, "u"}, {0x0172, "U"}, {0x0173, "u"},
    {0x0174, "W"}, {0x0175, "w"}, {0x0176, "Y"}, {0x0177, "y"},
    {0x0178, "Y"}
    };
  size_t k;

  for (k = 0; k < sizeof (table) / sizeof (table[0]); k++)
    {
    if (table[k].code == c)
      return table[k].ascii;
    }
  return NULL;
  }

/*============================================================================
  xhtml_transform_char

  Returns a newly created WString for the character c. When to_ascii is
  set and c is a non-ASCII character with a known replacement, the string
  holds the ASCII replacement (which may be several characters, e.g. "AE").
  In every other case the string holds c itself.

  The result is a fresh object on every path; the caller in this file
  releases it with wstring_destroy().
============================================================================*/
WString *xhtml_transform_char (uint32_t c, BOOL to_ascii)
  {
  /* ASCII characters never need transforming, so skip the lookup. */
  if (to_ascii && c > 127)
    {
    const char *substitute = xhtml_ascii_substitute (c);
    if (substitute != NULL)
      return wstring_create_from_utf8 (substitute);
    }

  /* Allocate only on the pass-through path, so that the early return
     above does not leave an unused string behind. */
  WString *ret = wstring_create_empty();
  wstring_append_c (ret, c);
  return ret;
  }
/*============================================================================
xhtml_translate_entity
============================================================================*/
WString *xhtml_translate_entity (const WString *entity)
{
/* Program flow in this function is very ugly, and prone to memory
leaks when modified. The whole thing needs to be rewritten */
char out[20];
IN
char *in = wstring_to_utf8 (entity);
if (strcasecmp (in, "amp") == 0)
strcpy (out, "&");
else if (strcasecmp (in, "nbsp") == 0)
strcpy (out, " ");
else if (strcasecmp (in, "lt") == 0)
strcpy (out, "<");
else if (strcasecmp (in, "gt") == 0)
strcpy (out, ">");
else if (strcasecmp (in, "cent") == 0)
strcpy (out, "¢");
else if (strcasecmp (in, "pound") == 0)
strcpy (out, "£");
else if (strcasecmp (in, "yen") == 0)
strcpy (out, "£");
else if (strcasecmp (in, "euro") == 0)
strcpy (out, "€");
else if (strcasecmp (in, "sect") == 0)
strcpy (out, "§");
else if (strcasecmp (in, "copy") == 0)
strcpy (out, "©");
else if (strcasecmp (in, "reg") == 0)
strcpy (out, "®");
else if (strcasecmp (in, "trade") == 0)
strcpy (out, "™");
else if (strcasecmp (in, "quot") == 0)
strcpy (out, "\"");
else if (in[0] == '#')
{
char *s = strdup (in);
s[0] = '0';
int v = 0;
if (sscanf (s, "%d", &v) == 1)
{
WString *ret = wstring_create_empty();
wstring_append_c (ret, (uint32_t)v);
OUT
free (s);
free (in);
return ret;
}
free (s);
}
else
{
strncpy (out, in, sizeof (out) - 1);
out[sizeof (out) - 1] = 0;
}
free (in);
OUT
return wstring_create_from_utf8 (out);
}
/*============================================================================
  xhtml_flush_line

  Sends the accumulated text in para to the output. In raw mode
  (options->raw) the text is converted to UTF-8 and written straight to
  stdout. Otherwise it is handed to the wrapper via wraptext_wrap_utf32(),
  followed by wraptext_eof(). para itself is not modified or cleared.
============================================================================*/
void xhtml_flush_line (const WString *para, const Epub2TxtOptions *options,
    WrapTextContext *context)
  {
  IN
  if (!options->raw)
    {
    wraptext_wrap_utf32 (context, wstring_wstr (para));
    wraptext_eof (context);
    }
  else
    {
    char *utf8 = wstring_to_utf8 (para);
    fputs (utf8, stdout);
    free (utf8);
    }
  OUT
  }
/*============================================================================
xhtml_flush_para
Outputs the accumulated paragraph text. Currently a plain alias for
xhtml_flush_line(): all three arguments are forwarded unchanged and
para is not cleared -- callers do that themselves.
============================================================================*/
void xhtml_flush_para (const WString *para, const Epub2TxtOptions *options,
WrapTextContext *context)
{
IN
xhtml_flush_line (para, options, context);
OUT
}
/*============================================================================
  xhtml_line_break

  Feeds a single WT_HARD_LINE_BREAK marker through the wrapper and then
  calls wraptext_eof(). Used for <br> handling. This always goes through
  the wrapper; unlike xhtml_para_break it has no raw-mode path.
============================================================================*/
void xhtml_line_break (WrapTextContext *context)
  {
  IN
  /* Zero-terminated UTF-32 string holding only the hard-break marker. */
  static uint32_t hard_break[] = { WT_HARD_LINE_BREAK, 0 };

  wraptext_wrap_utf32 (context, hard_break);
  wraptext_eof (context);
  OUT
  }
/*============================================================================
  xhtml_para_break

  Emits a paragraph separator of two newlines. In raw mode (options->raw)
  they are written directly to stdout; otherwise they are passed to the
  wrapper as a UTF-32 string.
============================================================================*/
void xhtml_para_break (WrapTextContext *context,
    const Epub2TxtOptions *options)
  {
  IN
  if (!options->raw)
    {
    static uint32_t blank_line[] = { '\n', '\n', 0 };
    wraptext_wrap_utf32 (context, blank_line);
    }
  else
    {
    fputs ("\n\n", stdout);
    }
  OUT
  }
/*============================================================================
  xhtml_all_white

  Returns TRUE when s is empty, otherwise whatever wstring_is_whitespace()
  reports for it. For the purposes of application logic, an empty string
  counts as whitespace.
============================================================================*/
BOOL xhtml_all_white (WString *s)
  {
  return (wstring_length (s) == 0) ? TRUE : wstring_is_whitespace (s);
  }
/*============================================================================
xhtml_utf8_to_stdout
============================================================================*/
void xhtml_utf8_to_stdout (const char *s, const Epub2TxtOptions *options,
char **error)
{
IN
char *ss;
// This is all a bit ugly. The entity translation is in
// xhtml_to_stdout, which expects something that looks like a viable
// XHTML file. There's no guarantee that the input to this function
// will actually be a full XHTML file, so we must wrap it in a body
// to fool xhtml_to_stdout. Ugh.
asprintf (&ss, "<body>%s</body>", s);
WString *sw = wstring_create_from_utf8 (ss);
xhtml_to_stdout (sw, options, error);
wstring_destroy (sw);
free (ss);
OUT
}
/*============================================================================
xhtml_file_to_stdout
============================================================================*/
void xhtml_file_to_stdout (const char *filename, const Epub2TxtOptions *options,
char **error)
{
IN
log_debug ("Process XHTML file %s", filename);
WString *s;
wstring_create_from_utf8_file (filename, &s, error);
if (*error == NULL)
{
xhtml_to_stdout (s, options, error);
wstring_destroy (s);
}
OUT
}
/*============================================================================
  xhtml_end_block

  Flushes the pending paragraph text and clears it. If the text contained
  anything other than whitespace, follows it with a hard line break
  (hard_line_break TRUE) or a paragraph break (hard_line_break FALSE).
  Private helper for xhtml_to_stdout.
============================================================================*/
static void xhtml_end_block (WString *para, const Epub2TxtOptions *options,
    WrapTextContext *context, BOOL hard_line_break)
  {
  /* Decide before flushing: empty or blank text gets no break after it,
     so that runs of empty elements do not pile up blank lines. */
  BOOL had_text = xhtml_all_white (para) ? FALSE : TRUE;

  xhtml_flush_para (para, options, context);
  wstring_clear (para);

  if (had_text)
    {
    if (hard_line_break)
      xhtml_line_break (context);
    else
      xhtml_para_break (context, options);
    }
  }

/*============================================================================
  xhtml_to_stdout

  Converts the XHTML document in s to text on stdout.

  The input is scanned one character at a time by a three-state machine:
  plain text, inside a tag ('<' .. '>'), and inside an entity ('&' .. ';').
  Text is collected only between <body> and </body>. It accumulates in a
  paragraph buffer that is flushed at paragraph, line-break, formatting
  and block-level tags. Text inside <rt> is collected separately and
  appended in parentheses when </ruby> is reached.

  Line wrapping uses options->width - 1 columns, or is effectively
  disabled when options->width <= 0. error is accepted for interface
  compatibility and is not written by this function.
============================================================================*/
void xhtml_to_stdout (const WString *s, const Epub2TxtOptions *options,
    char **error)
  {
  IN
  log_debug ("Process XHTML string");

  typedef enum {MODE_ANY = 0, MODE_INTAG = 1, MODE_ENTITY = 2} Mode;
  /* Bug #5: tags longer than this are abandoned. The figure is arbitrary,
     but larger than any tag this parser can make use of. */
  enum { MAX_TAG_LEN = 1000 };

  (void) error;

  int width = (options->width <= 0) ? INT_MAX : options->width - 1;

  WrapTextContext *context = wraptext_context_new();
  wraptext_context_set_width (context, width);
  wraptext_context_set_app_opts (context, (void *)options);

  Mode mode = MODE_ANY;
  BOOL inbody = FALSE;
  BOOL inruby = FALSE;
  WString *tag = wstring_create_empty();
  WString *entity = wstring_create_empty();
  WString *para = wstring_create_empty();
  WString *ruby = wstring_create_empty();
  uint32_t last_c = 0;
  int taglen = 0;
  const uint32_t *text = wstring_wstr (s);
  int i, l = wstring_length (s);

  for (i = 0; i < l; i++)
    {
    uint32_t c = text[i];
    if (c == 13) // DOS EOL
      continue;
    if (c == 9) // Tab
      c = ' ';

    if (mode == MODE_ANY && c == '<')
      {
      taglen = 0;
      mode = MODE_INTAG;
      }
    else if (mode == MODE_ANY && c == '\n')
      {
      /* A newline in running text acts as a single space. */
      if (inbody && last_c != ' ')
        wstring_append_c (para, ' ');
      }
    else if (mode == MODE_ANY && c == '&')
      {
      mode = MODE_ENTITY;
      }
    else if (mode == MODE_ANY)
      {
      /* Collapse runs of spaces; ignore everything outside <body>. */
      if (inbody && !(c == ' ' && last_c == ' '))
        {
        WString *transformed = xhtml_transform_char (c, options->ascii);
        wstring_append (inruby ? ruby : para, transformed);
        wstring_destroy (transformed);
        }
      }
    else if (mode == MODE_ENTITY && c == ';')
      {
      if (inbody)
        {
        WString *trans = xhtml_translate_entity (entity);
        wstring_append (inruby ? ruby : para, trans);
        wstring_destroy (trans);
        }
      wstring_clear (entity);
      mode = MODE_ANY;
      }
    else if (mode == MODE_ENTITY)
      {
      wstring_append_c (entity, c);
      }
    else if (mode == MODE_INTAG && c == '>')
      {
      taglen = 0;
      Format format = FORMAT_NONE;
      char *ss_tag = wstring_to_utf8 (tag);

      /* Keep only the tag name: cut at the first whitespace. Tabs were
         turned into spaces and CRs dropped above, so only space and
         newline can separate the name from its attributes. */
      ss_tag[strcspn (ss_tag, " \n")] = 0;

      if (strcasecmp (ss_tag, "body") == 0)
        {
        inbody = TRUE;
        }
      else if (strcasecmp (ss_tag, "/body") == 0)
        {
        xhtml_end_block (para, options, context, FALSE);
        inbody = FALSE;
        }
      else if ((strcasecmp (ss_tag, "p/") == 0)
                || (strcasecmp (ss_tag, "/p") == 0))
        {
        if (inbody)
          xhtml_end_block (para, options, context, FALSE);
        }
      else if ((strcasecmp (ss_tag, "br/") == 0)
                || (strcasecmp (ss_tag, "br") == 0))
        {
        /* "<br />" arrives here as "br", because of the cut above. */
        if (inbody)
          xhtml_end_block (para, options, context, TRUE);
        }
      else if (xhtml_is_start_format_tag (ss_tag, &format))
        {
        if (inbody)
          {
          xhtml_flush_line (para, options, context);
          wstring_clear (para);
          xhtml_emit_format (options, format);
          xhtml_set_format (options, format, context);
          }
        }
      else if (xhtml_is_end_format_tag (ss_tag, &format))
        {
        if (inbody)
          {
          xhtml_flush_line (para, options, context);
          xhtml_emit_format (options, format);
          xhtml_set_format (options, format, context);
          wstring_clear (para);
          }
        }
      else if (xhtml_is_end_breaking_tag (ss_tag, &format))
        {
        xhtml_flush_line (para, options, context);
        xhtml_emit_format (options, format);
        xhtml_set_format (options, format, context);
        wstring_clear (para);
        xhtml_para_break (context, options);
        }
      else if (xhtml_is_start_breaking_tag (ss_tag, &format))
        {
        xhtml_flush_line (para, options, context);
        wstring_clear (para);
        xhtml_emit_format (options, format);
        xhtml_set_format (options, format, context);
        }
      else if (strcasecmp (ss_tag, "ruby") == 0)
        {
        wstring_clear (ruby);
        }
      else if (strcasecmp (ss_tag, "/ruby") == 0)
        {
        // Append concatenated ruby annotations
        wstring_append_c (para, '(');
        wstring_append (para, ruby);
        wstring_append_c (para, ')');
        wstring_clear (ruby);
        }
      else if (strcasecmp (ss_tag, "rt") == 0)
        {
        // Start accumulating ruby annotations
        inruby = TRUE;
        }
      else if (strcasecmp (ss_tag, "/rt") == 0)
        {
        inruby = FALSE;
        }

      free (ss_tag);
      wstring_clear (tag);
      mode = MODE_ANY;
      }
    else if (mode == MODE_INTAG)
      {
      taglen++;
      if (taglen > MAX_TAG_LEN)
        {
        /* Over-long tag: skip to its closing '>' (or to the end of the
           input), discard what was collected and resume in text mode.
           The loop increment then steps past the '>'. */
        while (i < l && text[i] != (uint32_t)'>')
          i++;
        wstring_clear (tag);
        taglen = 0;
        mode = MODE_ANY;
        }
      else
        {
        wstring_append_c (tag, c);
        }
      }
    else
      log_error ("Unexpected character %d in mode %d", (int)c, (int)mode);

    last_c = c;
    }

  /* Output whatever text is still pending when the input runs out. */
  if (wstring_length (para) > 0)
    xhtml_flush_para (para, options, context);

  wstring_destroy (tag);
  wstring_destroy (entity);
  wstring_destroy (para);
  wstring_destroy (ruby);
  wraptext_eof (context);
  wraptext_context_free (context);
  OUT
  }
/* Generated by GNU Enscript 1.6.6, and GophHub 1.3. */