1 /*============================================================================ 2 epub2txt v2 3 wrap.c 4 Copyright (c)2020 Kevin Boone, GPL v3.0 5 6 This file contains general-purpose text string wrapping functions, that 7 work on 32-bit characters, so each character is a fixed length. This is 8 to avoid the problems with character length that tend to arise when 9 working with chars as UTF-8 bytes. 10 ============================================================================*/ 11 12 #include 13 14 #if !defined(__MACH__) 15 #include 16 #endif 17 18 #include 19 #include 20 #include "defs.h" 21 #include "wrap.h" 22 #include "convertutf.h" 23 #include "xhtml.h" 24 25 #define WT_STATE_START 0 26 #define WT_STATE_WORD 1 27 #define WT_STATE_WHITE 2 28 29 typedef struct _WrapTextContextPriv 30 { 31 WrapTextOutputFn outputFn; 32 int width; 33 int flags; 34 int state; 35 int column; 36 int white_count; 37 unsigned int fmt; 38 void *app_opts; 39 void *app_data; 40 BOOL blank_line; 41 WT_UTF32 last; 42 WT_UTF32 *token; 43 } WrapTextContextPriv; 44 45 46 /** Convert a single UTF32 character to a UTF8 representation, where 47 * the UTF8 is an array of characters terminated with a zero. The 48 * utf8 parameter must be a pointed to an array of WT_UTF8 (aka char) 49 * of at least WT_UTF8_MAX_BYTES size. */ 50 void wraptext_context_utf32_char_to_utf8 (const uint32_t c, WT_UTF8* utf8) 51 { 52 WT_UTF32 _in = c; 53 const UTF32* in = (const UTF32 *) &_in; 54 int max_out = WT_UTF8_MAX_BYTES; 55 UTF8 *out = (UTF8 *)utf8; 56 memset (out, 0, max_out * sizeof (UTF8)); 57 UTF8 *out_temp = out; 58 59 ConvertUTF32toUTF8 (&in, in + 1, 60 //&out_temp, out + max_out * 4, 0); 61 &out_temp, out + max_out, 0); 62 int len = out_temp - out; 63 utf8[len] = 0; 64 } 65 66 67 void _stdout_output_fn (void *app_data, WT_UTF32 c) 68 { 69 WT_UTF8 buff [WT_UTF8_MAX_BYTES]; 70 wraptext_context_utf32_char_to_utf8 (c, buff); 71 fputs (buff, stdout); 72 } 73 74 75 static void _wraptext_append_token (WrapTextContext *context, const WT_UTF32 c) 76 { 77 WT_UTF32 *token = context->priv->token; 78 if (!token) 79 { 80 token = malloc (sizeof (WT_UTF32)); 81 token[0] = 0; 82 } 83 84 int l = wraptext_utf32_length (token); 85 86 token = realloc (token, (l+2) * sizeof (WT_UTF32)); 87 88 token [l] = c; 89 token [l+1] = 0; 90 91 context->priv->token = token; 92 } 93 94 95 // Whitespace other than newline 96 BOOL _wraptext_is_white (WT_UTF32 c) 97 { 98 if (c == 160) return TRUE; // nbsp 99 if (c == 32) return TRUE; 100 if (c == 9) return TRUE; 101 //TODO -- other unicode whitespace chars 102 return FALSE; 103 } 104 105 // Whitespace other than newline 106 BOOL _wraptext_is_all_white (const WT_UTF32 *s) 107 { 108 while (*s) 109 { 110 if (!_wraptext_is_white (*s)) return FALSE; 111 s++; 112 } 113 return TRUE; 114 } 115 116 117 // TODO -- detect other newline characters 118 BOOL _wraptext_is_newline (WT_UTF32 c) 119 { 120 if (c == 10) return TRUE; 121 return FALSE; 122 } 123 124 125 void _wraptext_emit_newline (WrapTextContext *context) 126 { 127 context->priv->outputFn (context->priv->app_data, (WT_UTF32)'\n'); 128 } 129 130 131 void _wraptext_new_line (WrapTextContext *context) 132 { 133 _wraptext_emit_newline (context); 134 context->priv->column = 0; 135 } 136 137 138 void _wraptext_flush_string (WrapTextContext *context, WT_UTF32 *s) 139 { 140 int i, l = wraptext_utf32_length (s); 141 142 if (l + context->priv->column + 1 >= context->priv->width) 143 { 144 xhtml_emit_fmt_eol_pre (context); /* upcall: turn-off all ANSI highlghting before EOL */ 145 _wraptext_emit_newline (context); 146 xhtml_emit_fmt_eol_post (context); /* upcall: restore ANSI highlighting after EOL */ 147 context->priv->column = 0; 148 } 149 150 for (i = 0; i < l; i++) 151 { 152 WT_UTF32 c = s[i]; 153 context->priv->outputFn (context->priv->app_data, c); 154 } 155 156 context->priv->column += l; 157 } 158 159 160 void _wraptext_flush_space (WrapTextContext *context, BOOL allowAtStart) 161 { 162 if ((context->priv->column > 0) || allowAtStart) 163 { 164 context->priv->outputFn (context->priv->app_data, ' '); 165 context->priv->column++; 166 } 167 } 168 169 170 void _wraptext_flush_token (WrapTextContext *context) 171 { 172 WT_UTF32 *token = context->priv->token; 173 // Don't flush anything -- even a space -- if the token is 174 // null. This will only happen at end-of-line or end-of-file 175 // states (hopefully) 176 if (token) 177 { 178 if (token[0]) 179 { 180 if (!_wraptext_is_all_white (token)) 181 context->priv->blank_line = FALSE; 182 } 183 _wraptext_flush_string (context, token); 184 _wraptext_flush_space (context, FALSE); 185 free (context->priv->token); 186 } 187 188 context->priv->token = NULL; 189 } 190 191 192 void _wraptext_wrap_next (WrapTextContext *context, const WT_UTF32 c) 193 { 194 WT_UTF32 last = context->priv->last; 195 196 int state = context->priv->state; 197 198 // This logic counts spaces at the ends of lines, so MD-style 199 // double-space linebreaks can be respected. 200 // NB -- not used in epub2txt 201 if (_wraptext_is_newline (c)) 202 { 203 } 204 else 205 { 206 if (_wraptext_is_white (c)) 207 context->priv->white_count++; 208 else 209 context->priv->white_count = 0; 210 } 211 212 // STATE_START 213 214 if (state == WT_STATE_START && _wraptext_is_newline (c)) 215 { 216 //printf ("!"); 217 // Double blank line -- respect this as a para separator 218 if (context->priv->blank_line) 219 { 220 } 221 else 222 { 223 _wraptext_new_line (context); 224 _wraptext_new_line (context); 225 context->priv->blank_line = TRUE; 226 } 227 state = WT_STATE_WHITE; 228 } 229 else if (state == WT_STATE_START && _wraptext_is_white (c)) 230 { 231 // Space at the beginning of the line 232 // Do nothing yet TODO 233 } 234 else if (state == WT_STATE_START) 235 { 236 _wraptext_append_token (context, c); 237 state = WT_STATE_WORD; 238 } 239 240 // STATE_WORD 241 242 else if (state == WT_STATE_WORD && c == WT_HARD_LINE_BREAK) 243 { 244 _wraptext_flush_token (context); 245 _wraptext_new_line (context); 246 state = WT_STATE_START; 247 } 248 else if (state == WT_STATE_WORD && _wraptext_is_newline (c)) 249 { 250 _wraptext_flush_token (context); 251 state = WT_STATE_START; 252 } 253 else if (state == WT_STATE_WORD && _wraptext_is_white (c)) 254 { 255 _wraptext_flush_token (context); 256 state = WT_STATE_WHITE; 257 } 258 else if (state == WT_STATE_WORD) 259 { 260 _wraptext_append_token (context, c); 261 state = WT_STATE_WORD; 262 } 263 264 // STATE_WHITE 265 266 else if (state == WT_STATE_WHITE && _wraptext_is_newline (c)) 267 { 268 _wraptext_flush_token (context); 269 state = WT_STATE_START; 270 } 271 else if (state == WT_STATE_WHITE && _wraptext_is_white (c)) 272 { 273 state = WT_STATE_WHITE; 274 } 275 else if (state == WT_STATE_WHITE) 276 { 277 _wraptext_append_token (context, c); 278 state = WT_STATE_WORD; 279 } 280 281 // We should ever get here 282 else 283 { 284 fprintf (stderr, "Internal error: char %d in state %d\n", c, state); 285 exit (-1); 286 } 287 288 context->priv->last = last; 289 context->priv->state = state; 290 } 291 292 293 void wraptext_eof (WrapTextContext *context) 294 { 295 // Handle any input that has not been handled already 296 _wraptext_flush_token (context); 297 } 298 299 300 void wraptext_wrap_utf32 (WrapTextContext *context, const WT_UTF32 *utf32) 301 { 302 int i, len = wraptext_utf32_length (utf32); 303 for (i = 0; i < len; i++) 304 { 305 WT_UTF32 c = utf32[i]; 306 _wraptext_wrap_next (context, c); 307 } 308 } 309 310 311 void wraptext_easy_stdout_utf32 (const int width, const WT_UTF32 *utf32, 312 int flags) 313 { 314 WrapTextContext *context = wraptext_context_new(); 315 wraptext_context_set_output_fn (context, _stdout_output_fn); 316 wraptext_context_set_flags (context, flags); 317 wraptext_context_set_width (context, width); 318 wraptext_wrap_utf32 (context, utf32); 319 wraptext_eof (context); 320 wraptext_context_free (context); 321 } 322 323 324 WrapTextContext *wraptext_context_new (void) 325 { 326 WrapTextContext *self = malloc (sizeof (WrapTextContext)); 327 memset (self, 0, sizeof (WrapTextContext)); 328 WrapTextContextPriv *priv = malloc (sizeof (WrapTextContextPriv)); 329 memset (priv, 0, sizeof (WrapTextContextPriv)); 330 self->priv = priv; 331 self->priv->width = 80; 332 self->priv->blank_line = TRUE; // Assume that we are starting on a new line 333 self->priv->outputFn = _stdout_output_fn; 334 wraptext_context_reset (self); 335 return self; 336 } 337 338 339 void wraptext_context_reset (WrapTextContext *self) 340 { 341 self->priv->state = WT_STATE_START; 342 self->priv->column = 0; 343 self->priv->last = 0; 344 self->priv->white_count = 0; 345 self->priv->fmt = 0; 346 self->priv->blank_line = TRUE; 347 if (self->priv->token) free (self->priv->token); 348 self->priv->token = NULL; 349 } 350 351 352 void wraptext_context_set_output_fn (WrapTextContext *self, 353 WrapTextOutputFn fn) 354 { 355 self->priv->outputFn = fn; 356 } 357 358 359 void wraptext_context_set_width (WrapTextContext *self, int width) 360 { 361 self->priv->width = width; 362 } 363 364 void wraptext_context_set_flags (WrapTextContext *self, int flags) 365 { 366 self->priv->flags = flags; 367 } 368 369 void wraptext_context_zero_fmt (WrapTextContext *self) 370 { 371 self->priv->fmt = 0; 372 } 373 374 unsigned int wraptext_context_get_fmt (WrapTextContext *self) 375 { 376 return self->priv->fmt; 377 } 378 379 void wraptext_context_set_fmt (WrapTextContext *self, unsigned int fmt) 380 { 381 self->priv->fmt |= fmt; 382 } 383 384 void wraptext_context_reset_fmt (WrapTextContext *self, unsigned int fmt) 385 { 386 self->priv->fmt &= ~fmt; 387 } 388 389 void wraptext_context_set_app_opts (WrapTextContext *self, void *app_opts) 390 { 391 self->priv->app_opts = app_opts; 392 } 393 394 void *wraptext_context_get_app_opts (WrapTextContext *self) 395 { 396 return self->priv->app_opts; 397 } 398 399 void wraptext_context_set_app_data (WrapTextContext *self, void *app_data) 400 { 401 self->priv->app_data = app_data; 402 } 403 404 void wraptext_context_free (WrapTextContext *self) 405 { 406 if (!self) return; 407 if (self->priv) 408 { 409 free (self->priv); 410 self->priv = NULL; 411 } 412 free (self); 413 } 414 415 416 const int wraptext_utf32_length (const WT_UTF32 *s) 417 { 418 if (!s) return 0; 419 int i = 0; 420 WT_UTF32 c = 0; 421 do 422 { 423 c = s[i]; 424 i++; 425 } while (c != 0); 426 return i - 1; 427 } 428 429 430