Raw File
/*============================================================================
epub2txt v2
wrap.c
Copyright (c)2020 Kevin Boone, GPL v3.0
This file contains general-purpose text string wrapping functions, that
work on 32-bit characters, so each character is a fixed length. This is
to avoid the problems with character length that tend to arise when
working with chars as UTF-8 bytes.
============================================================================*/
#include <stdio.h>
#if !defined(__MACH__)
#include <malloc.h>
#endif
#include <string.h>
#include <stdlib.h>
#include "defs.h"
#include "wrap.h"
#include "convertutf.h"
#include "xhtml.h"
// States of the wrapping state machine. The current state is held in
// WrapTextContextPriv.state and is advanced, one input character at a
// time, by _wraptext_wrap_next().
// WT_STATE_START: set by wraptext_context_reset(), and re-entered after
// a newline or a hard line break has been processed.
#define WT_STATE_START 0
// WT_STATE_WORD: non-whitespace characters are being collected into the
// pending token.
#define WT_STATE_WORD 1
// WT_STATE_WHITE: whitespace has been seen; further whitespace is ignored
// until the next word or newline.
#define WT_STATE_WHITE 2
/** Private state of a WrapTextContext. Allocated in wraptext_context_new()
 * and reached through the context's priv pointer. */
typedef struct _WrapTextContextPriv
{
// Callback that receives every output character, together with app_data
WrapTextOutputFn outputFn;
// Line width used when deciding whether a token still fits on the
// current line; wraptext_context_new() sets this to 80
int width;
// Value stored by wraptext_context_set_flags(); not read in this file
int flags;
// Current state machine state: one of the WT_STATE_* values
int state;
// Number of characters written to the current output line so far
int column;
// Length of the current run of non-newline whitespace in the input
int white_count;
// Bit mask maintained by the wraptext_context_*_fmt() functions
// NOTE(review): presumably text-formatting attributes owned by the
// xhtml layer -- confirm against xhtml.c
unsigned int fmt;
// Opaque pointer stored/returned by wraptext_context_{set,get}_app_opts()
void *app_opts;
// Opaque pointer handed to outputFn as its first argument
void *app_data;
// TRUE while nothing but whitespace has been flushed since the start
// or since the last paragraph break; used to avoid emitting repeated
// paragraph breaks
BOOL blank_line;
// Last-character slot: zeroed by wraptext_context_reset() and written
// back at the end of _wraptext_wrap_next()
WT_UTF32 last;
// Heap-allocated, zero-terminated word being accumulated, or NULL
// when no word is pending
WT_UTF32 *token;
} WrapTextContextPriv;
/** Convert a single UTF32 character to a UTF8 representation, where
 * the UTF8 is an array of characters terminated with a zero. The
 * utf8 parameter must point to an array of WT_UTF8 (aka char)
 * of at least WT_UTF8_MAX_BYTES size. At most WT_UTF8_MAX_BYTES - 1
 * bytes of encoded data are produced, so that the terminating zero
 * always lies inside the caller's buffer. */
void wraptext_context_utf32_char_to_utf8 (const uint32_t c, WT_UTF8* utf8)
  {
  WT_UTF32 _in = c;
  const UTF32 *in = (const UTF32 *) &_in;
  UTF8 *out = (UTF8 *) utf8;
  UTF8 *out_temp = out;

  // Clear the whole buffer up front, so the result is a valid (possibly
  // empty) string wherever the converter stops writing.
  memset (out, 0, WT_UTF8_MAX_BYTES * sizeof (UTF8));

  // Offer the converter one byte less than the buffer holds. Were it
  // allowed to fill all WT_UTF8_MAX_BYTES bytes, the terminator below
  // would be written one element past the end of the array.
  ConvertUTF32toUTF8 (&in, in + 1,
      &out_temp, out + WT_UTF8_MAX_BYTES - 1, 0);

  // out_temp now points just past the last byte written
  utf8 [out_temp - out] = 0;
  }
/** Output function that encodes one UTF32 character as UTF8 and writes
 * it to standard output. This is the output function installed by
 * wraptext_context_new(). The app_data argument is ignored. */
void _stdout_output_fn (void *app_data, WT_UTF32 c)
  {
  WT_UTF8 encoded [WT_UTF8_MAX_BYTES];

  (void) app_data; // Part of the callback signature; nothing to do with it

  wraptext_context_utf32_char_to_utf8 (c, encoded);
  fputs (encoded, stdout);
  }
/** Append the character c to the context's pending token, growing (or
 * creating) the zero-terminated token buffer as required. If memory
 * cannot be obtained, an error is reported on stderr and the program
 * exits, in the same way as other internal errors in this file. */
static void _wraptext_append_token (WrapTextContext *context, const WT_UTF32 c)
  {
  WT_UTF32 *token = context->priv->token;

  // A NULL token has length zero, and realloc (NULL, n) behaves like
  // malloc (n), so the "no token yet" case needs no special handling.
  int l = wraptext_utf32_length (token);

  // Room for the existing characters, the new one, and the terminator.
  // The result goes into a separate pointer so that the old buffer is
  // not lost if the allocation fails.
  WT_UTF32 *grown = realloc (token, ((size_t) l + 2) * sizeof (WT_UTF32));
  if (!grown)
    {
    fprintf (stderr, "Out of memory extending token\n");
    exit (-1);
    }

  grown [l] = c;
  grown [l + 1] = 0;
  context->priv->token = grown;
  }
// Returns TRUE if c is a whitespace character other than newline:
// tab, space, or non-breaking space. Everything else gives FALSE.
//TODO -- other unicode whitespace chars
BOOL _wraptext_is_white (WT_UTF32 c)
  {
  switch (c)
    {
    case 9:   // tab
    case 32:  // space
    case 160: // nbsp
      return TRUE;
    default:
      return FALSE;
    }
  }
// Returns TRUE if every character of the zero-terminated string s is
// whitespace other than newline (see _wraptext_is_white). An empty
// string gives TRUE. s must not be NULL.
BOOL _wraptext_is_all_white (const WT_UTF32 *s)
  {
  const WT_UTF32 *p;
  for (p = s; *p != 0; p++)
    {
    if (_wraptext_is_white (*p))
      continue;
    // Found something that is not whitespace -- no need to look further
    return FALSE;
    }
  return TRUE;
  }
// Returns TRUE if c is a newline character. Only line feed (10) is
// recognized at present.
// TODO -- detect other newline characters
BOOL _wraptext_is_newline (WT_UTF32 c)
  {
  return (c == 10) ? TRUE : FALSE;
  }
/** Send a single newline character to the context's output function.
 * The column counter is left alone; see _wraptext_new_line() for the
 * variant that also resets it. */
void _wraptext_emit_newline (WrapTextContext *context)
  {
  WrapTextContextPriv *priv = context->priv;
  const WT_UTF32 newline = (WT_UTF32) '\n';
  priv->outputFn (priv->app_data, newline);
  }
/** Start a new output line: emit a newline character and record that
 * the output position is back at column zero. */
void _wraptext_new_line (WrapTextContext *context)
  {
  WrapTextContextPriv *priv = context->priv;
  _wraptext_emit_newline (context);
  priv->column = 0;
  }
/** Write the zero-terminated string s to the output, first breaking the
 * line if s (plus one further column) would reach the configured width.
 * When a break is made, the xhtml layer is called immediately before and
 * after the newline, so that ANSI highlighting can be switched off for
 * the end of the line and restored afterwards. The column counter is
 * advanced by the length of s. */
void _wraptext_flush_string (WrapTextContext *context, WT_UTF32 *s)
  {
  WrapTextContextPriv *priv = context->priv;
  const int len = wraptext_utf32_length (s);

  // NOTE(review): the test does not look at whether column is already
  // zero, so a string that can never fit is still preceded by a break.
  if (len + priv->column + 1 >= priv->width)
    {
    xhtml_emit_fmt_eol_pre (context);  /* upcall: highlighting off */
    _wraptext_emit_newline (context);
    xhtml_emit_fmt_eol_post (context); /* upcall: highlighting back on */
    priv->column = 0;
    }

  int pos = 0;
  while (pos < len)
    {
    priv->outputFn (priv->app_data, s[pos]);
    pos++;
    }

  priv->column += len;
  }
/** Write a single space to the output and advance the column counter.
 * Nothing is written at the very start of a line (column zero) unless
 * allowAtStart is TRUE. */
void _wraptext_flush_space (WrapTextContext *context, BOOL allowAtStart)
  {
  WrapTextContextPriv *priv = context->priv;

  // At the start of a line, and not permitted to put a space there
  if (priv->column <= 0 && !allowAtStart)
    return;

  priv->outputFn (priv->app_data, ' ');
  priv->column++;
  }
/** Write out the pending token, if there is one, followed by a space,
 * then release the token's memory. A token that contains anything other
 * than whitespace clears the context's blank_line flag. In every case
 * the context is left with no pending token. */
void _wraptext_flush_token (WrapTextContext *context)
  {
  WrapTextContextPriv *priv = context->priv;
  WT_UTF32 *pending = priv->token;

  // With no token there is nothing to write -- not even a space. That
  // should only happen at end-of-line or end-of-file.
  if (pending != NULL)
    {
    if (pending[0] != 0 && !_wraptext_is_all_white (pending))
      priv->blank_line = FALSE;

    _wraptext_flush_string (context, pending);
    _wraptext_flush_space (context, FALSE);
    free (priv->token);
    }

  priv->token = NULL;
  }
/** Feed one input character, c, to the wrapping state machine.
 *
 * Non-whitespace characters are collected into the pending token. The
 * token is flushed to the output when whitespace, a newline or
 * WT_HARD_LINE_BREAK follows a word. A single newline in the input is
 * treated as a word separator; a newline that arrives while in the
 * start state (that is, an empty line) produces a paragraph break of
 * two newlines, unless only blank output has been produced since the
 * previous break.
 *
 * On return the context's state and last-character fields have been
 * updated. An unrecognized state is reported on stderr and terminates
 * the program. */
void _wraptext_wrap_next (WrapTextContext *context, const WT_UTF32 c)
  {
  WrapTextContextPriv *priv = context->priv;
  int state = priv->state;

  // This logic counts spaces at the ends of lines, so MD-style
  // double-space linebreaks can be respected. A newline leaves the
  // count as it is.
  // NB -- not used in epub2txt
  if (!_wraptext_is_newline (c))
    {
    if (_wraptext_is_white (c))
      priv->white_count++;
    else
      priv->white_count = 0;
    }

  switch (state)
    {
    case WT_STATE_START:
      if (_wraptext_is_newline (c))
        {
        // Empty line -- respect this as a para separator, but do not
        // stack up further blank lines if we are already on one
        if (!priv->blank_line)
          {
          _wraptext_new_line (context);
          _wraptext_new_line (context);
          priv->blank_line = TRUE;
          }
        state = WT_STATE_WHITE;
        }
      else if (!_wraptext_is_white (c))
        {
        _wraptext_append_token (context, c);
        state = WT_STATE_WORD;
        }
      // Otherwise: space at the beginning of the line. It is ignored,
      // and the state does not change. TODO
      break;

    case WT_STATE_WORD:
      if (c == WT_HARD_LINE_BREAK)
        {
        _wraptext_flush_token (context);
        _wraptext_new_line (context);
        state = WT_STATE_START;
        }
      else if (_wraptext_is_newline (c))
        {
        _wraptext_flush_token (context);
        state = WT_STATE_START;
        }
      else if (_wraptext_is_white (c))
        {
        _wraptext_flush_token (context);
        state = WT_STATE_WHITE;
        }
      else
        {
        // Still inside the word
        _wraptext_append_token (context, c);
        }
      break;

    case WT_STATE_WHITE:
      if (_wraptext_is_newline (c))
        {
        _wraptext_flush_token (context);
        state = WT_STATE_START;
        }
      else if (!_wraptext_is_white (c))
        {
        _wraptext_append_token (context, c);
        state = WT_STATE_WORD;
        }
      // Otherwise: more whitespace, which is skipped
      break;

    default:
      // We should never get here
      fprintf (stderr, "Internal error: char %u in state %d\n",
          (unsigned int) c, state);
      exit (-1);
    }

  // Record the character just handled. Previously the old value was
  // simply written back, so the field never changed.
  priv->last = c;
  priv->state = state;
  }
/** Tell the context that there is no more input. A word is normally
 * written out only when the whitespace or newline after it arrives, so
 * a word at the very end of the input would otherwise stay in the
 * pending token; this flushes it. */
void wraptext_eof (WrapTextContext *context)
{
// Handle any input that has not been handled already
_wraptext_flush_token (context);
}
/** Feed every character of the zero-terminated UTF32 string utf32 into
 * the wrapping state machine, in order. A NULL string is treated as
 * empty. Call wraptext_eof() after the final string to flush the last
 * word. */
void wraptext_wrap_utf32 (WrapTextContext *context, const WT_UTF32 *utf32)
  {
  if (utf32 == NULL)
    return;

  const WT_UTF32 *p = utf32;
  while (*p != 0)
    {
    _wraptext_wrap_next (context, *p);
    p++;
    }
  }
/** Convenience function: wrap the zero-terminated UTF32 string utf32 to
 * the given width and write the result to standard output. A temporary
 * context is created for the call and destroyed before returning. */
void wraptext_easy_stdout_utf32 (const int width, const WT_UTF32 *utf32,
    int flags)
  {
  WrapTextContext *ctx = wraptext_context_new ();

  // Configure the context; these settings are independent of each other
  wraptext_context_set_width (ctx, width);
  wraptext_context_set_flags (ctx, flags);
  wraptext_context_set_output_fn (ctx, _stdout_output_fn);

  // Wrap the text, then push out whatever word is still pending
  wraptext_wrap_utf32 (ctx, utf32);
  wraptext_eof (ctx);

  wraptext_context_free (ctx);
  }
WrapTextContext *wraptext_context_new (void)
{
WrapTextContext *self = malloc (sizeof (WrapTextContext));
memset (self, 0, sizeof (WrapTextContext));
WrapTextContextPriv *priv = malloc (sizeof (WrapTextContextPriv));
memset (priv, 0, sizeof (WrapTextContextPriv));
self->priv = priv;
self->priv->width = 80;
self->priv->blank_line = TRUE; // Assume that we are starting on a new line
self->priv->outputFn = _stdout_output_fn;
wraptext_context_reset (self);
return self;
}
/** Return the context to its initial wrapping state: start state,
 * column zero, no formatting bits set, blank-line flag raised, and any
 * pending token discarded. The width, flags, output function and
 * application pointers are not changed. */
void wraptext_context_reset (WrapTextContext *self)
  {
  WrapTextContextPriv *priv = self->priv;

  // Throw away any partly-collected word; free() accepts NULL
  free (priv->token);
  priv->token = NULL;

  priv->state = WT_STATE_START;
  priv->blank_line = TRUE;
  priv->column = 0;
  priv->white_count = 0;
  priv->last = 0;
  priv->fmt = 0;
  }
/** Set the function that receives each output character. It is called
 * with the context's app_data pointer and the character to write. */
void wraptext_context_set_output_fn (WrapTextContext *self,
WrapTextOutputFn fn)
{
self->priv->outputFn = fn;
}
/** Set the line width, in characters, that is used when deciding
 * whether to break the line before writing a word. */
void wraptext_context_set_width (WrapTextContext *self, int width)
{
self->priv->width = width;
}
/** Store the flags value in the context. Nothing in this file reads
 * the stored value. */
void wraptext_context_set_flags (WrapTextContext *self, int flags)
{
self->priv->flags = flags;
}
/** Clear every bit of the context's fmt mask. */
void wraptext_context_zero_fmt (WrapTextContext *self)
{
self->priv->fmt = 0;
}
/** Return the context's current fmt mask. */
unsigned int wraptext_context_get_fmt (WrapTextContext *self)
{
return self->priv->fmt;
}
/** Turn on the bits given in fmt in the context's fmt mask. Bits that
 * are already set stay set: the value is OR-ed in, not assigned. */
void wraptext_context_set_fmt (WrapTextContext *self, unsigned int fmt)
{
self->priv->fmt |= fmt;
}
/** Turn off the bits given in fmt in the context's fmt mask, leaving
 * all other bits as they were. */
void wraptext_context_reset_fmt (WrapTextContext *self, unsigned int fmt)
{
self->priv->fmt &= ~fmt;
}
/** Store an application-supplied pointer in the context; it can be
 * read back with wraptext_context_get_app_opts(). The pointer is only
 * stored and returned by the functions in this file. */
void wraptext_context_set_app_opts (WrapTextContext *self, void *app_opts)
{
self->priv->app_opts = app_opts;
}
/** Return the pointer most recently stored with
 * wraptext_context_set_app_opts(), or NULL if none has been stored
 * since the context was created. */
void *wraptext_context_get_app_opts (WrapTextContext *self)
{
return self->priv->app_opts;
}
/** Set the application pointer that is passed as the first argument
 * to the output function on every call. */
void wraptext_context_set_app_data (WrapTextContext *self, void *app_data)
{
self->priv->app_data = app_data;
}
/** Release a context created by wraptext_context_new(), together with
 * everything it owns, including any word that was still pending because
 * wraptext_eof() was not called. Passing NULL is allowed and does
 * nothing. The application pointers (app_opts, app_data) are not freed;
 * they remain the caller's responsibility. */
void wraptext_context_free (WrapTextContext *self)
  {
  if (!self) return;
  if (self->priv)
    {
    // The token is a separate heap block; without this it leaks
    // whenever the context is freed with a word still buffered
    free (self->priv->token);
    self->priv->token = NULL;
    free (self->priv);
    self->priv = NULL;
    }
  free (self);
  }
/** Return the number of characters in the zero-terminated UTF32 string
 * s, not counting the terminating zero. A NULL pointer gives 0. */
const int wraptext_utf32_length (const WT_UTF32 *s)
  {
  int count = 0;
  if (s != NULL)
    {
    // Advance to the index of the first zero character
    while (s[count] != 0)
      count++;
    }
  return count;
  }
Generated by GNU Enscript 1.6.6, and GophHub 1.3.