Raw File
1 /*============================================================================
2 epub2txt v2
3 wrap.c
4 Copyright (c)2020 Kevin Boone, GPL v3.0
5
6 This file contains general-purpose text string wrapping functions, that
7 work on 32-bit characters, so each character is a fixed length. This is
8 to avoid the problems with character length that tend to arise when
9 working with chars as UTF-8 bytes.
10 ============================================================================*/
11
12 #include <stdio.h>
13
14 #if !defined(__MACH__)
15 #include <malloc.h>
16 #endif
17
18 #include <string.h>
19 #include <stdlib.h>
20 #include "defs.h"
21 #include "wrap.h"
22 #include "convertutf.h"
23 #include "xhtml.h"
24
/* States of the wrapping state machine run by _wraptext_wrap_next(). */
enum
{
  /* Initial state; also entered after a newline ends a word or a run of
     whitespace, and after a hard line break. */
  WT_STATE_START = 0,
  /* Non-whitespace characters are being collected into the token. */
  WT_STATE_WORD = 1,
  /* Between tokens: the previous character was whitespace (or a newline
     seen while in the START state). */
  WT_STATE_WHITE = 2
};
28
29 typedef struct _WrapTextContextPriv
30 {
31 WrapTextOutputFn outputFn;
32 int width;
33 int flags;
34 int state;
35 int column;
36 int white_count;
37 unsigned int fmt;
38 void *app_opts;
39 void *app_data;
40 BOOL blank_line;
41 WT_UTF32 last;
42 WT_UTF32 *token;
43 } WrapTextContextPriv;
44
45
46 /** Convert a single UTF32 character to a UTF8 representation, where
47 * the UTF8 is an array of characters terminated with a zero. The
48 * utf8 parameter must be a pointed to an array of WT_UTF8 (aka char)
49 * of at least WT_UTF8_MAX_BYTES size. */
50 void wraptext_context_utf32_char_to_utf8 (const uint32_t c, WT_UTF8* utf8)
51 {
52 WT_UTF32 _in = c;
53 const UTF32* in = (const UTF32 *) &_in;
54 int max_out = WT_UTF8_MAX_BYTES;
55 UTF8 *out = (UTF8 *)utf8;
56 memset (out, 0, max_out * sizeof (UTF8));
57 UTF8 *out_temp = out;
58
59 ConvertUTF32toUTF8 (&in, in + 1,
60 //&out_temp, out + max_out * 4, 0);
61 &out_temp, out + max_out, 0);
62 int len = out_temp - out;
63 utf8[len] = 0;
64 }
65
66
67 void _stdout_output_fn (void *app_data, WT_UTF32 c)
68 {
69 WT_UTF8 buff [WT_UTF8_MAX_BYTES];
70 wraptext_context_utf32_char_to_utf8 (c, buff);
71 fputs (buff, stdout);
72 }
73
74
75 static void _wraptext_append_token (WrapTextContext *context, const WT_UTF32 c)
76 {
77 WT_UTF32 *token = context->priv->token;
78 if (!token)
79 {
80 token = malloc (sizeof (WT_UTF32));
81 token[0] = 0;
82 }
83
84 int l = wraptext_utf32_length (token);
85
86 token = realloc (token, (l+2) * sizeof (WT_UTF32));
87
88 token [l] = c;
89 token [l+1] = 0;
90
91 context->priv->token = token;
92 }
93
94
95 // Whitespace other than newline
96 BOOL _wraptext_is_white (WT_UTF32 c)
97 {
98 if (c == 160) return TRUE; // nbsp
99 if (c == 32) return TRUE;
100 if (c == 9) return TRUE;
101 //TODO -- other unicode whitespace chars
102 return FALSE;
103 }
104
105 // Whitespace other than newline
106 BOOL _wraptext_is_all_white (const WT_UTF32 *s)
107 {
108 while (*s)
109 {
110 if (!_wraptext_is_white (*s)) return FALSE;
111 s++;
112 }
113 return TRUE;
114 }
115
116
117 // TODO -- detect other newline characters
118 BOOL _wraptext_is_newline (WT_UTF32 c)
119 {
120 if (c == 10) return TRUE;
121 return FALSE;
122 }
123
124
125 void _wraptext_emit_newline (WrapTextContext *context)
126 {
127 context->priv->outputFn (context->priv->app_data, (WT_UTF32)'\n');
128 }
129
130
131 void _wraptext_new_line (WrapTextContext *context)
132 {
133 _wraptext_emit_newline (context);
134 context->priv->column = 0;
135 }
136
137
138 void _wraptext_flush_string (WrapTextContext *context, WT_UTF32 *s)
139 {
140 int i, l = wraptext_utf32_length (s);
141
142 if (l + context->priv->column + 1 >= context->priv->width)
143 {
144 xhtml_emit_fmt_eol_pre (context); /* upcall: turn-off all ANSI highlghting before EOL */
145 _wraptext_emit_newline (context);
146 xhtml_emit_fmt_eol_post (context); /* upcall: restore ANSI highlighting after EOL */
147 context->priv->column = 0;
148 }
149
150 for (i = 0; i < l; i++)
151 {
152 WT_UTF32 c = s[i];
153 context->priv->outputFn (context->priv->app_data, c);
154 }
155
156 context->priv->column += l;
157 }
158
159
160 void _wraptext_flush_space (WrapTextContext *context, BOOL allowAtStart)
161 {
162 if ((context->priv->column > 0) || allowAtStart)
163 {
164 context->priv->outputFn (context->priv->app_data, ' ');
165 context->priv->column++;
166 }
167 }
168
169
170 void _wraptext_flush_token (WrapTextContext *context)
171 {
172 WT_UTF32 *token = context->priv->token;
173 // Don't flush anything -- even a space -- if the token is
174 // null. This will only happen at end-of-line or end-of-file
175 // states (hopefully)
176 if (token)
177 {
178 if (token[0])
179 {
180 if (!_wraptext_is_all_white (token))
181 context->priv->blank_line = FALSE;
182 }
183 _wraptext_flush_string (context, token);
184 _wraptext_flush_space (context, FALSE);
185 free (context->priv->token);
186 }
187
188 context->priv->token = NULL;
189 }
190
191
192 void _wraptext_wrap_next (WrapTextContext *context, const WT_UTF32 c)
193 {
194 WT_UTF32 last = context->priv->last;
195
196 int state = context->priv->state;
197
198 // This logic counts spaces at the ends of lines, so MD-style
199 // double-space linebreaks can be respected.
200 // NB -- not used in epub2txt
201 if (_wraptext_is_newline (c))
202 {
203 }
204 else
205 {
206 if (_wraptext_is_white (c))
207 context->priv->white_count++;
208 else
209 context->priv->white_count = 0;
210 }
211
212 // STATE_START
213
214 if (state == WT_STATE_START && _wraptext_is_newline (c))
215 {
216 //printf ("!");
217 // Double blank line -- respect this as a para separator
218 if (context->priv->blank_line)
219 {
220 }
221 else
222 {
223 _wraptext_new_line (context);
224 _wraptext_new_line (context);
225 context->priv->blank_line = TRUE;
226 }
227 state = WT_STATE_WHITE;
228 }
229 else if (state == WT_STATE_START && _wraptext_is_white (c))
230 {
231 // Space at the beginning of the line
232 // Do nothing yet TODO
233 }
234 else if (state == WT_STATE_START)
235 {
236 _wraptext_append_token (context, c);
237 state = WT_STATE_WORD;
238 }
239
240 // STATE_WORD
241
242 else if (state == WT_STATE_WORD && c == WT_HARD_LINE_BREAK)
243 {
244 _wraptext_flush_token (context);
245 _wraptext_new_line (context);
246 state = WT_STATE_START;
247 }
248 else if (state == WT_STATE_WORD && _wraptext_is_newline (c))
249 {
250 _wraptext_flush_token (context);
251 state = WT_STATE_START;
252 }
253 else if (state == WT_STATE_WORD && _wraptext_is_white (c))
254 {
255 _wraptext_flush_token (context);
256 state = WT_STATE_WHITE;
257 }
258 else if (state == WT_STATE_WORD)
259 {
260 _wraptext_append_token (context, c);
261 state = WT_STATE_WORD;
262 }
263
264 // STATE_WHITE
265
266 else if (state == WT_STATE_WHITE && _wraptext_is_newline (c))
267 {
268 _wraptext_flush_token (context);
269 state = WT_STATE_START;
270 }
271 else if (state == WT_STATE_WHITE && _wraptext_is_white (c))
272 {
273 state = WT_STATE_WHITE;
274 }
275 else if (state == WT_STATE_WHITE)
276 {
277 _wraptext_append_token (context, c);
278 state = WT_STATE_WORD;
279 }
280
281 // We should ever get here
282 else
283 {
284 fprintf (stderr, "Internal error: char %d in state %d\n", c, state);
285 exit (-1);
286 }
287
288 context->priv->last = last;
289 context->priv->state = state;
290 }
291
292
293 void wraptext_eof (WrapTextContext *context)
294 {
295 // Handle any input that has not been handled already
296 _wraptext_flush_token (context);
297 }
298
299
300 void wraptext_wrap_utf32 (WrapTextContext *context, const WT_UTF32 *utf32)
301 {
302 int i, len = wraptext_utf32_length (utf32);
303 for (i = 0; i < len; i++)
304 {
305 WT_UTF32 c = utf32[i];
306 _wraptext_wrap_next (context, c);
307 }
308 }
309
310
311 void wraptext_easy_stdout_utf32 (const int width, const WT_UTF32 *utf32,
312 int flags)
313 {
314 WrapTextContext *context = wraptext_context_new();
315 wraptext_context_set_output_fn (context, _stdout_output_fn);
316 wraptext_context_set_flags (context, flags);
317 wraptext_context_set_width (context, width);
318 wraptext_wrap_utf32 (context, utf32);
319 wraptext_eof (context);
320 wraptext_context_free (context);
321 }
322
323
324 WrapTextContext *wraptext_context_new (void)
325 {
326 WrapTextContext *self = malloc (sizeof (WrapTextContext));
327 memset (self, 0, sizeof (WrapTextContext));
328 WrapTextContextPriv *priv = malloc (sizeof (WrapTextContextPriv));
329 memset (priv, 0, sizeof (WrapTextContextPriv));
330 self->priv = priv;
331 self->priv->width = 80;
332 self->priv->blank_line = TRUE; // Assume that we are starting on a new line
333 self->priv->outputFn = _stdout_output_fn;
334 wraptext_context_reset (self);
335 return self;
336 }
337
338
339 void wraptext_context_reset (WrapTextContext *self)
340 {
341 self->priv->state = WT_STATE_START;
342 self->priv->column = 0;
343 self->priv->last = 0;
344 self->priv->white_count = 0;
345 self->priv->fmt = 0;
346 self->priv->blank_line = TRUE;
347 if (self->priv->token) free (self->priv->token);
348 self->priv->token = NULL;
349 }
350
351
352 void wraptext_context_set_output_fn (WrapTextContext *self,
353 WrapTextOutputFn fn)
354 {
355 self->priv->outputFn = fn;
356 }
357
358
359 void wraptext_context_set_width (WrapTextContext *self, int width)
360 {
361 self->priv->width = width;
362 }
363
364 void wraptext_context_set_flags (WrapTextContext *self, int flags)
365 {
366 self->priv->flags = flags;
367 }
368
369 void wraptext_context_zero_fmt (WrapTextContext *self)
370 {
371 self->priv->fmt = 0;
372 }
373
374 unsigned int wraptext_context_get_fmt (WrapTextContext *self)
375 {
376 return self->priv->fmt;
377 }
378
379 void wraptext_context_set_fmt (WrapTextContext *self, unsigned int fmt)
380 {
381 self->priv->fmt |= fmt;
382 }
383
384 void wraptext_context_reset_fmt (WrapTextContext *self, unsigned int fmt)
385 {
386 self->priv->fmt &= ~fmt;
387 }
388
389 void wraptext_context_set_app_opts (WrapTextContext *self, void *app_opts)
390 {
391 self->priv->app_opts = app_opts;
392 }
393
394 void *wraptext_context_get_app_opts (WrapTextContext *self)
395 {
396 return self->priv->app_opts;
397 }
398
399 void wraptext_context_set_app_data (WrapTextContext *self, void *app_data)
400 {
401 self->priv->app_data = app_data;
402 }
403
404 void wraptext_context_free (WrapTextContext *self)
405 {
406 if (!self) return;
407 if (self->priv)
408 {
409 free (self->priv);
410 self->priv = NULL;
411 }
412 free (self);
413 }
414
415
416 const int wraptext_utf32_length (const WT_UTF32 *s)
417 {
418 if (!s) return 0;
419 int i = 0;
420 WT_UTF32 c = 0;
421 do
422 {
423 c = s[i];
424 i++;
425 } while (c != 0);
426 return i - 1;
427 }
428
429
430
431
Generated by GNU Enscript 1.6.6, and GophHub 1.3.