GophHub - kevinboone/epub2txt2/src/wrap.c


Raw File

    1	/*============================================================================
    2	  epub2txt v2 
    3	  wrap.c
    4	  Copyright (c)2020 Kevin Boone, GPL v3.0
    5	
    6	  This file contains general-purpose text string wrapping functions, that
    7	  work on 32-bit characters, so each character is a fixed length. This is
    8	  to avoid the problems with character length that tend to arise when 
    9	  working with chars as UTF-8 bytes.
   10	============================================================================*/
   11	
   12	#include <stdio.h>
   13	
   14	#if !defined(__MACH__)
   15	#include <malloc.h>
   16	#endif
   17	
   18	#include <string.h>
   19	#include <stdlib.h>
   20	#include "defs.h" 
   21	#include "wrap.h"
   22	#include "convertutf.h"
   23	#include "xhtml.h"
   24	
/* States of the character-by-character wrapping state machine; the current
   state is kept in WrapTextContextPriv.state. */
/* Initial state; also re-entered after a newline or hard line break ends
   a word, or a newline follows whitespace. */
#define WT_STATE_START 0
/* Characters of a word are being accumulated into the pending token. */
#define WT_STATE_WORD 1
/* The most recent input was whitespace (or a newline seen in the start
   state). */
#define WT_STATE_WHITE 2
   28	
/* Private per-context state of the text wrapper. Allocated and zeroed by
   wraptext_context_new() and reached through WrapTextContext.priv. */
typedef struct _WrapTextContextPriv 
  {
  // Callback that receives every output character, together with app_data
  WrapTextOutputFn outputFn;
  // Line width used to decide when a token must start on a new line
  //  (wraptext_context_new() sets this to 80)
  int width;
  // Stored by wraptext_context_set_flags(); not read in this file
  int flags;
  // Current state-machine state: one of the WT_STATE_* values
  int state;
  // Number of characters written on the current output line
  int column;
  // Count of consecutive non-newline whitespace characters seen; reset to
  //  zero by any non-white, non-newline character
  int white_count;
  // Bit mask manipulated by the wraptext_context_*_fmt() functions; its
  //  meaning is not defined in this file
  unsigned int fmt;
  // Opaque pointer stored/returned by the app_opts accessors
  void *app_opts;
  // Opaque pointer handed to outputFn on every call
  void *app_data;
  // TRUE until a token that is not entirely whitespace is flushed; set
  //  TRUE again once a paragraph break has been emitted
  BOOL blank_line;
  // Cleared by wraptext_context_reset(); not otherwise consulted in this file
  WT_UTF32 last;
  // Zero-terminated, heap-allocated word being accumulated, or NULL if
  //  no word is pending
  WT_UTF32 *token;
  } WrapTextContextPriv;
   44	
   45	
   46	/** Convert a single UTF32 character to a UTF8 representation, where
   47	 * the UTF8 is an array of characters terminated with a zero. The 
   48	 * utf8 parameter must be a pointed to an array of WT_UTF8 (aka char)
   49	 * of at least WT_UTF8_MAX_BYTES size. */
   50	void wraptext_context_utf32_char_to_utf8 (const uint32_t c, WT_UTF8* utf8)
   51	  {
   52	  WT_UTF32 _in = c;
   53	  const UTF32* in = (const UTF32 *) &_in;
   54	  int max_out = WT_UTF8_MAX_BYTES;
   55	  UTF8 *out = (UTF8 *)utf8;
   56	  memset (out, 0, max_out * sizeof (UTF8));
   57	  UTF8 *out_temp = out;
   58	
   59	  ConvertUTF32toUTF8 (&in, in + 1,
   60	      //&out_temp, out + max_out * 4, 0);
   61	      &out_temp, out + max_out, 0);
   62	  int len = out_temp - out;
   63	  utf8[len] = 0;
   64	  }
   65	
   66	
   67	void _stdout_output_fn (void *app_data, WT_UTF32 c)
   68	  {
   69	  WT_UTF8 buff [WT_UTF8_MAX_BYTES];  
   70	  wraptext_context_utf32_char_to_utf8 (c, buff);
   71	  fputs (buff, stdout); 
   72	  }
   73	
   74	
   75	static void _wraptext_append_token (WrapTextContext *context, const WT_UTF32 c)
   76	  {
   77	  WT_UTF32 *token = context->priv->token;
   78	  if (!token)
   79	    {
   80	    token = malloc (sizeof (WT_UTF32));
   81	    token[0] = 0;
   82	    }
   83	   
   84	  int l = wraptext_utf32_length (token);
   85	   
   86	  token = realloc (token, (l+2) * sizeof (WT_UTF32));
   87	
   88	  token [l] = c;
   89	  token [l+1] = 0;
   90	
   91	  context->priv->token = token;
   92	  }
   93	
   94	
   95	// Whitespace other than newline
   96	BOOL _wraptext_is_white (WT_UTF32 c)
   97	  {
   98	  if (c == 160) return TRUE; // nbsp
   99	  if (c == 32) return TRUE;
  100	  if (c == 9) return TRUE;
  101	  //TODO -- other unicode whitespace chars
  102	  return FALSE;
  103	  }
  104	
  105	// Whitespace other than newline
  106	BOOL _wraptext_is_all_white (const WT_UTF32 *s)
  107	  {
  108	  while (*s)
  109	    {
  110	    if (!_wraptext_is_white (*s)) return FALSE;
  111	    s++;
  112	    }
  113	  return TRUE;
  114	  }
  115	
  116	
  117	// TODO -- detect other newline characters 
  118	BOOL _wraptext_is_newline (WT_UTF32 c)
  119	  {
  120	  if (c == 10) return TRUE;
  121	  return FALSE;
  122	  }
  123	
  124	
  125	void _wraptext_emit_newline (WrapTextContext *context)
  126	  {
  127	  context->priv->outputFn (context->priv->app_data, (WT_UTF32)'\n'); 
  128	  }
  129	
  130	
  131	void _wraptext_new_line (WrapTextContext *context)
  132	  {
  133	  _wraptext_emit_newline (context);
  134	  context->priv->column = 0;
  135	  }
  136	
  137	
  138	void _wraptext_flush_string (WrapTextContext *context, WT_UTF32 *s)
  139	  {
  140	  int i, l = wraptext_utf32_length (s);
  141	
  142	  if (l + context->priv->column + 1 >= context->priv->width)
  143	    {
  144	    xhtml_emit_fmt_eol_pre (context);    /* upcall: turn-off all ANSI highlghting before EOL */
  145	    _wraptext_emit_newline (context);
  146	    xhtml_emit_fmt_eol_post (context);   /* upcall: restore ANSI highlighting after EOL */
  147	    context->priv->column = 0;
  148	    }
  149	 
  150	  for (i = 0; i < l; i++)
  151	    {
  152	    WT_UTF32 c = s[i];
  153	    context->priv->outputFn (context->priv->app_data, c); 
  154	    }
  155	
  156	  context->priv->column += l;
  157	  }
  158	
  159	
  160	void _wraptext_flush_space (WrapTextContext *context, BOOL allowAtStart)
  161	  {
  162	  if ((context->priv->column > 0) || allowAtStart)
  163	    {
  164	    context->priv->outputFn (context->priv->app_data, ' '); 
  165	    context->priv->column++;
  166	    }
  167	  }
  168	
  169	
  170	void _wraptext_flush_token (WrapTextContext *context)
  171	  {
  172	  WT_UTF32 *token = context->priv->token;
  173	  // Don't flush anything -- even a space -- if the token is
  174	  //  null. This will only happen at end-of-line or end-of-file
  175	  //  states (hopefully)
  176	  if (token)
  177	    {
  178	    if (token[0])
  179	      {
  180	      if (!_wraptext_is_all_white (token))
  181	        context->priv->blank_line = FALSE;
  182	      }
  183	    _wraptext_flush_string (context, token);
  184	    _wraptext_flush_space (context, FALSE);
  185	    free (context->priv->token);
  186	    }
  187	
  188	  context->priv->token = NULL;
  189	  }
  190	
  191	
  192	void _wraptext_wrap_next (WrapTextContext *context, const WT_UTF32 c)
  193	  {
  194	  WT_UTF32 last = context->priv->last;
  195	
  196	  int state = context->priv->state;
  197	
  198	  // This logic counts spaces at the ends of lines, so MD-style
  199	  //   double-space linebreaks can be respected.
  200	  // NB -- not used in epub2txt
  201	  if (_wraptext_is_newline (c))
  202	    {
  203	    }
  204	  else
  205	    {
  206	    if (_wraptext_is_white (c))
  207	      context->priv->white_count++;
  208	    else
  209	      context->priv->white_count = 0;
  210	    }
  211	  
  212	  // STATE_START
  213	
  214	  if (state == WT_STATE_START && _wraptext_is_newline (c))
  215	     {
  216	     //printf ("!");
  217	     // Double blank line -- respect this as a para separator
  218	     if (context->priv->blank_line)
  219	       {
  220	       }
  221	     else
  222	       {
  223	       _wraptext_new_line (context); 
  224	       _wraptext_new_line (context); 
  225	       context->priv->blank_line = TRUE;
  226	       }
  227	     state = WT_STATE_WHITE;
  228	     }
  229	  else if (state == WT_STATE_START && _wraptext_is_white (c))
  230	     {
  231	     // Space at the beginning of the line
  232	     // Do nothing yet TODO
  233	     }
  234	  else if (state == WT_STATE_START)
  235	     {
  236	     _wraptext_append_token (context, c);
  237	     state = WT_STATE_WORD;
  238	     }
  239	
  240	  // STATE_WORD
  241	
  242	  else if (state == WT_STATE_WORD && c == WT_HARD_LINE_BREAK)
  243	     {
  244	     _wraptext_flush_token (context);
  245	     _wraptext_new_line (context);
  246	     state = WT_STATE_START;
  247	     }
  248	  else if (state == WT_STATE_WORD && _wraptext_is_newline (c))
  249	     {
  250	     _wraptext_flush_token (context);
  251	     state = WT_STATE_START;
  252	     }
  253	  else if (state == WT_STATE_WORD && _wraptext_is_white (c))
  254	     {
  255	     _wraptext_flush_token (context);
  256	     state = WT_STATE_WHITE;
  257	     }
  258	  else if (state == WT_STATE_WORD)
  259	     {
  260	     _wraptext_append_token (context, c);
  261	     state = WT_STATE_WORD;
  262	     }
  263	  
  264	  // STATE_WHITE
  265	
  266	  else if (state == WT_STATE_WHITE && _wraptext_is_newline (c))
  267	     {
  268	     _wraptext_flush_token (context);
  269	     state = WT_STATE_START;
  270	     }
  271	  else if (state == WT_STATE_WHITE && _wraptext_is_white (c))
  272	     {
  273	     state = WT_STATE_WHITE;
  274	     }
  275	  else if (state == WT_STATE_WHITE)
  276	     {
  277	     _wraptext_append_token (context, c);
  278	     state = WT_STATE_WORD;
  279	     }
  280	  
  281	  // We should ever get here
  282	  else
  283	     {
  284	     fprintf (stderr, "Internal error: char %d in state %d\n", c, state);
  285	     exit (-1);
  286	     }
  287	
  288	  context->priv->last = last;
  289	  context->priv->state = state;
  290	  }
  291	
  292	
  293	void wraptext_eof (WrapTextContext *context)
  294	  {
  295	  // Handle any input that has not been handled already
  296	  _wraptext_flush_token (context);
  297	  }
  298	
  299	
  300	void wraptext_wrap_utf32 (WrapTextContext *context, const WT_UTF32 *utf32)
  301	  {
  302	  int i, len = wraptext_utf32_length (utf32);
  303	  for (i = 0; i < len; i++)
  304	    {
  305	    WT_UTF32 c = utf32[i];
  306	    _wraptext_wrap_next (context, c);
  307	    }
  308	  }
  309	
  310	
  311	void wraptext_easy_stdout_utf32 (const int width, const WT_UTF32 *utf32,
  312	     int flags)
  313	  {
  314	  WrapTextContext *context = wraptext_context_new();
  315	  wraptext_context_set_output_fn (context, _stdout_output_fn);
  316	  wraptext_context_set_flags (context, flags);
  317	  wraptext_context_set_width (context, width);
  318	  wraptext_wrap_utf32 (context, utf32);
  319	  wraptext_eof (context);
  320	  wraptext_context_free (context);
  321	  }
  322	
  323	
  324	WrapTextContext *wraptext_context_new (void)
  325	  {
  326	  WrapTextContext *self = malloc (sizeof (WrapTextContext));
  327	  memset (self, 0, sizeof (WrapTextContext));
  328	  WrapTextContextPriv *priv = malloc (sizeof (WrapTextContextPriv));
  329	  memset (priv, 0, sizeof (WrapTextContextPriv));
  330	  self->priv = priv;
  331	  self->priv->width = 80;
  332	  self->priv->blank_line = TRUE; // Assume that we are starting on a new line
  333	  self->priv->outputFn = _stdout_output_fn;
  334	  wraptext_context_reset (self);
  335	  return self;
  336	  }
  337	
  338	
  339	void wraptext_context_reset (WrapTextContext *self)
  340	  {
  341	  self->priv->state = WT_STATE_START;
  342	  self->priv->column = 0;
  343	  self->priv->last = 0;
  344	  self->priv->white_count = 0;
  345	  self->priv->fmt = 0;
  346	  self->priv->blank_line = TRUE;
  347	  if (self->priv->token) free (self->priv->token);
  348	  self->priv->token = NULL;
  349	  }
  350	
  351	
  352	void wraptext_context_set_output_fn (WrapTextContext *self, 
  353	    WrapTextOutputFn fn)
  354	  {
  355	  self->priv->outputFn = fn;
  356	  }
  357	
  358	
  359	void wraptext_context_set_width (WrapTextContext *self, int width)
  360	  {
  361	  self->priv->width = width;
  362	  }
  363	
  364	void wraptext_context_set_flags (WrapTextContext *self, int flags)
  365	  {
  366	  self->priv->flags = flags;
  367	  }
  368	
  369	void wraptext_context_zero_fmt (WrapTextContext *self)
  370	  {
  371	  self->priv->fmt = 0;
  372	  }
  373	
  374	unsigned int wraptext_context_get_fmt (WrapTextContext *self)
  375	  {
  376	  return self->priv->fmt;
  377	  }
  378	
  379	void wraptext_context_set_fmt (WrapTextContext *self, unsigned int fmt)
  380	  {
  381	  self->priv->fmt |= fmt;
  382	  }
  383	
  384	void wraptext_context_reset_fmt (WrapTextContext *self, unsigned int fmt)
  385	  {
  386	  self->priv->fmt &= ~fmt;
  387	  }
  388	
  389	void wraptext_context_set_app_opts (WrapTextContext *self, void *app_opts)
  390	  {
  391	  self->priv->app_opts = app_opts;
  392	  }
  393	
  394	void *wraptext_context_get_app_opts (WrapTextContext *self)
  395	  {
  396	  return self->priv->app_opts;
  397	  }
  398	
  399	void wraptext_context_set_app_data (WrapTextContext *self, void *app_data)
  400	  {
  401	  self->priv->app_data = app_data;
  402	  }
  403	
  404	void wraptext_context_free (WrapTextContext *self)
  405	  {
  406	  if (!self) return;
  407	  if (self->priv)
  408	    {
  409	    free (self->priv);
  410	    self->priv = NULL;
  411	    }
  412	  free (self);
  413	  }
  414	
  415	
  416	const int wraptext_utf32_length (const WT_UTF32 *s)
  417	  {
  418	  if (!s) return 0;
  419	  int i = 0;
  420	  WT_UTF32 c = 0;
  421	  do
  422	    {
  423	    c = s[i];
  424	    i++;
  425	    } while (c != 0);
  426	  return i - 1;
  427	  }
  428	
  429	
  430	
  431	

Generated by GNU Enscript 1.6.6, and GophHub 1.3.