Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Blame | Last modification | View Log | Download | RSS feed

  1. /*
  2.  * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
  3.  * Copyright 2008 Michael Drake <tlsa@netsurf-browser.org>
  4.  *
  5.  * This file is part of NetSurf, http://www.netsurf-browser.org/
  6.  *
  7.  * NetSurf is free software; you can redistribute it and/or modify
  8.  * it under the terms of the GNU General Public License as published by
  9.  * the Free Software Foundation; version 2 of the License.
  10.  *
  11.  * NetSurf is distributed in the hope that it will be useful,
  12.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14.  * GNU General Public License for more details.
  15.  *
  16.  * You should have received a copy of the GNU General Public License
  17.  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18.  */
  19.  
  20. /** \file
  21.   * Text export of HTML (implementation).
  22.   */
  23.  
  24. #include <assert.h>
  25. #include <stdbool.h>
  26. #include <string.h>
  27.  
  28. #include <dom/dom.h>
  29.  
  30. #include "utils/config.h"
  31. #include "content/content.h"
  32. #include "content/hlcache.h"
  33. #include "desktop/save_text.h"
  34. #include "render/box.h"
  35. #include "render/html.h"
  36. #include "utils/log.h"
  37. #include "utils/utf8.h"
  38. #include "utils/utils.h"
  39.  
  40. static void extract_text(struct box *box, bool *first,
  41.                 save_text_whitespace *before, struct save_text_state *save);
  42. static bool save_text_add_to_buffer(const char *text, size_t length,
  43.                 struct box *box, const char *whitespace_text,
  44.                 size_t whitespace_length, struct save_text_state *save);
  45.  
  46.  
  47. /**
  48.  * Extract the text from an HTML content and save it as a text file. Text is
  49.  * converted to the local encoding.
  50.  *
  51.  * \param  c            An HTML content.
  52.  * \param  path         Path to save text file too.
  53.  */
  54.  
  55. void save_as_text(hlcache_handle *c, char *path)
  56. {
  57.         FILE *out;
  58.         struct save_text_state save = { NULL, 0, 0 };
  59.         save_text_whitespace before = WHITESPACE_NONE;
  60.         bool first = true;
  61.         utf8_convert_ret ret;
  62.         char *result;
  63.  
  64.         if (!c || content_get_type(c) != CONTENT_HTML) {
  65.                 return;
  66.         }
  67.  
  68.         extract_text(html_get_box_tree(c), &first, &before, &save);
  69.         if (!save.block)
  70.                 return;
  71.  
  72.         ret = utf8_to_local_encoding(save.block, save.length, &result);
  73.         free(save.block);
  74.  
  75.         if (ret != UTF8_CONVERT_OK) {
  76.                 LOG(("failed to convert to local encoding, return %d", ret));
  77.                 return;
  78.         }
  79.  
  80.         out = fopen(path, "w");
  81.         if (out) {
  82.                 int res = fputs(result, out);
  83.  
  84.                 if (res < 0) {
  85.                         LOG(("Warning: write failed"));
  86.                 }
  87.  
  88.                 res = fputs("\n", out);
  89.                 if (res < 0) {
  90.                         LOG(("Warning: failed writing trailing newline"));
  91.                 }
  92.  
  93.                 fclose(out);
  94.         }
  95.  
  96.         free(result);
  97. }
  98.  
  99.  
  100. /**
  101.  * Decide what whitespace to place before the next bit of content-related text
  102.  * that is saved. Any existing whitespace is overridden if the whitespace for
  103.  * this box is more "significant".
  104.  *
  105.  * \param  box          Pointer to box.
  106.  * \param  first        Whether this is before the first bit of content-related
  107.  *                      text to be saved.
  108.  * \param  before       Type of whitespace currently intended to be placed
  109.  *                      before the next bit of content-related text to be saved.
  110.  *                      Updated if this box is worthy of more significant
  111.  *                      whitespace.
  112.  * \param  whitespace_text    Whitespace to place before next bit of
  113.  *                            content-related text to be saved.
  114.  *                            Updated if this box is worthy of more significant
  115.  *                            whitespace.
  116.  * \param  whitespace_length  Length of whitespace_text.
  117.  *                            Updated if this box is worthy of more significant
  118.  *                            whitespace.
  119.  */
  120.  
  121. void save_text_solve_whitespace(struct box *box, bool *first,
  122.                 save_text_whitespace *before, const char **whitespace_text,
  123.                 size_t *whitespace_length)
  124. {
  125.         /* work out what whitespace should be placed before the next bit of
  126.          * text */
  127.         if (*before < WHITESPACE_TWO_NEW_LINES &&
  128.                         /* significant box type */
  129.                         (box->type == BOX_BLOCK ||
  130.                          box->type == BOX_TABLE ||
  131.                          box->type == BOX_FLOAT_LEFT ||
  132.                          box->type == BOX_FLOAT_RIGHT) &&
  133.                         /* and not a list element */
  134.                         !box->list_marker &&
  135.                         /* and not a marker... */
  136.                         (!(box->parent && box->parent->list_marker == box) ||
  137.                          /* ...unless marker follows WHITESPACE_TAB */
  138.                          ((box->parent && box->parent->list_marker == box) &&
  139.                           *before == WHITESPACE_TAB))) {
  140.                 *before = WHITESPACE_TWO_NEW_LINES;
  141.         } else if (*before <= WHITESPACE_ONE_NEW_LINE &&
  142.                         (box->type == BOX_TABLE_ROW ||
  143.                          box->type == BOX_BR ||
  144.                          (box->type != BOX_INLINE &&
  145.                          (box->parent && box->parent->list_marker == box)) ||
  146.                          (box->parent && box->parent->style &&
  147.                           (css_computed_white_space(box->parent->style) ==
  148.                            CSS_WHITE_SPACE_PRE ||
  149.                            css_computed_white_space(box->parent->style) ==
  150.                            CSS_WHITE_SPACE_PRE_WRAP) &&
  151.                           box->type == BOX_INLINE_CONTAINER))) {
  152.                 if (*before == WHITESPACE_ONE_NEW_LINE)
  153.                         *before = WHITESPACE_TWO_NEW_LINES;
  154.                 else
  155.                         *before = WHITESPACE_ONE_NEW_LINE;
  156.         }
  157.         else if (*before < WHITESPACE_TAB &&
  158.                         (box->type == BOX_TABLE_CELL ||
  159.                          box->list_marker)) {
  160.                 *before = WHITESPACE_TAB;
  161.         }
  162.  
  163.         if (*first) {
  164.                 /* before the first bit of text to be saved; there is
  165.                  * no preceding whitespace */
  166.                 *whitespace_text = "";
  167.                 *whitespace_length = 0;
  168.         } else {
  169.                 /* set the whitespace that has been decided on */
  170.                 switch (*before) {
  171.                         case WHITESPACE_TWO_NEW_LINES:
  172.                                 *whitespace_text = "\n\n";
  173.                                 *whitespace_length = 2;
  174.                                 break;
  175.                         case WHITESPACE_ONE_NEW_LINE:
  176.                                 *whitespace_text = "\n";
  177.                                 *whitespace_length = 1;
  178.                                 break;
  179.                         case WHITESPACE_TAB:
  180.                                 *whitespace_text = "\t";
  181.                                 *whitespace_length = 1;
  182.                                 break;
  183.                         case WHITESPACE_NONE:
  184.                                 *whitespace_text = "";
  185.                                 *whitespace_length = 0;
  186.                                 break;
  187.                         default:
  188.                                 *whitespace_text = "";
  189.                                 *whitespace_length = 0;
  190.                                 break;
  191.                 }
  192.         }
  193. }
  194.  
  195.  
  196. /**
  197.  * Traverse though the box tree and add all text to a save buffer.
  198.  *
  199.  * \param  box          Pointer to box.
  200.  * \param  first        Whether this is before the first bit of content-related
  201.  *                      text to be saved.
  202.  * \param  before       Type of whitespace currently intended to be placed
  203.  *                      before the next bit of content-related text to be saved.
  204.  *                      Updated if this box is worthy of more significant
  205.  *                      whitespace.
  206.  * \param  save         our save_text_state workspace pointer
  207.  * \return true iff the file writing succeeded and traversal should continue.
  208.  */
  209.  
  210. void extract_text(struct box *box, bool *first, save_text_whitespace *before,
  211.                 struct save_text_state *save)
  212. {
  213.         struct box *child;
  214.         const char *whitespace_text = "";
  215.         size_t whitespace_length = 0;
  216.  
  217.         assert(box);
  218.  
  219.         /* If box has a list marker */
  220.         if (box->list_marker) {
  221.                 /* do the marker box before continuing with the rest of the
  222.                  * list element */
  223.                 extract_text(box->list_marker, first, before, save);
  224.         }
  225.  
  226.         /* read before calling the handler in case it modifies the tree */
  227.         child = box->children;
  228.  
  229.         save_text_solve_whitespace(box, first, before, &whitespace_text,
  230.                         &whitespace_length);
  231.  
  232.         if (box->type != BOX_BR && !((box->type == BOX_FLOAT_LEFT ||
  233.                         box->type == BOX_FLOAT_RIGHT) && !box->text) &&
  234.                         box->length > 0 && box->text) {
  235.                 /* Box meets criteria for export; add text to buffer */
  236.                 save_text_add_to_buffer(box->text, box->length, box,
  237.                                 whitespace_text, whitespace_length, save);
  238.                 *first = false;
  239.                 *before = WHITESPACE_NONE;
  240.         }
  241.  
  242.         /* Work though the children of this box, extracting any text */
  243.         while (child) {
  244.                 extract_text(child, first, before, save);
  245.                 child = child->next;
  246.         }
  247.  
  248.         return;
  249. }
  250.  
  251.  
  252. /**
  253.  * Add text to save text buffer. Any preceding whitespace or following space is
  254.  * also added to the buffer.
  255.  *
  256.  * \param  text         Pointer to text being added.
  257.  * \param  length       Length of text to be appended (bytes).
  258.  * \param  box          Pointer to text box.
  259.  * \param  whitespace_text    Whitespace to place before text for formatting
  260.  *                            may be NULL.
  261.  * \param  whitespace_length  Length of whitespace_text.
  262.  * \param  save         Our save_text_state workspace pointer.
  263.  * \return true iff the file writing succeeded and traversal should continue.
  264.  */
  265.  
  266. bool save_text_add_to_buffer(const char *text, size_t length, struct box *box,
  267.                 const char *whitespace_text, size_t whitespace_length,
  268.                 struct save_text_state *save)
  269. {
  270.         size_t new_length;
  271.         int space = 0;
  272.  
  273.         assert(save);
  274.  
  275.         if (box->space > 0)
  276.                 space = 1;
  277.  
  278.         if (whitespace_text)
  279.                 length += whitespace_length;
  280.  
  281.         new_length = save->length + whitespace_length + length + space;
  282.         if (new_length >= save->alloc) {
  283.                 size_t new_alloc = save->alloc + (save->alloc / 4);
  284.                 char *new_block;
  285.  
  286.                 if (new_alloc < new_length) new_alloc = new_length;
  287.  
  288.                 new_block = realloc(save->block, new_alloc);
  289.                 if (!new_block) return false;
  290.  
  291.                 save->block = new_block;
  292.                 save->alloc = new_alloc;
  293.         }
  294.         if (whitespace_text) {
  295.                 memcpy(save->block + save->length, whitespace_text,
  296.                                 whitespace_length);
  297.         }
  298.         memcpy(save->block + save->length + whitespace_length, text, length);
  299.         save->length += length;
  300.  
  301.         if (space == 1)
  302.                 save->block[save->length++] = ' ';
  303.  
  304.         return true;
  305. }
  306.