htmltotext.php 9.23 KB
Newer Older
1
2
3
<?php
/**
 * Mahara: Electronic portfolio, weblog, resume builder and social networking
 * Copyright (C) 2006-2009 Catalyst IT Ltd and others; see:
 *                         http://wiki.mahara.org/Contributors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @package    mahara
 * @subpackage core
 * @author     Catalyst IT Ltd
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL
 * @copyright  (C) 2006-2009 Catalyst IT Ltd http://catalyst.net.nz
 */

defined('INTERNAL') || die();

/**
 * Converts HTML into plain text using a light, markdown-like notation.
 *
 * Headings are prefixed with '#', emphasis is marked with '_' or '**',
 * block quotes, lists, definitions and table cells are indented, and links
 * and images are emitted as numbered references ("[text][1]", "![alt][2]")
 * whose targets are listed at the end of the text. Lines built with
 * wrapping enabled are wrapped at 75 columns.
 */
class HtmltoText {

    /** The <body> element to convert, or null when the input produced none. */
    private $body;
    /** Finished output lines; text() joins them with "\n". */
    private $lines;
    /** Line under construction: object with 'text', 'wrap' and 'prefix' members. */
    private $line;
    /** Not read or written by any method of this class. */
    private $prefix;
    /** Break pending before the next output: 0 = none, 1 = new line, 2 = new paragraph. */
    private $newlines;
    /**
     * Set to 0 by the constructor and not read anywhere in this class.
     * Declared explicitly so that the assignment does not create a dynamic
     * property (deprecated as of PHP 8.2).
     */
    private $pre;
    /** Stack of active indent kinds; each entry is a key of $indentchar. */
    private $indent;
    /** String prepended to URLs that are not absolute. */
    private $baseurl;
    /** Map of URL => reference number, in order of first appearance. */
    private $links;
    /** Highest reference number handed out so far. */
    private $linkcount;

    /** Indent text used for the innermost indent level at the start of a new line. */
    private $indentfirstchar = array('bq' => '> ', 'list' => '- ', 'dd' => '   ', 'td' => '  ');
    /** Indent text used for outer indent levels and for wrapped continuation lines. */
    private $indentchar      = array('bq' => '> ', 'list' => '  ', 'dd' => '   ', 'td' => '  ');

    /**
     * Parses the HTML and prepares an empty output buffer.
     *
     * @param string $html    HTML document or fragment to convert
     * @param string $baseurl String prepended to link/image URLs that are not absolute
     */
    public function __construct($html, $baseurl) {
        $this->body = null;

        // DOMDocument::loadHTML() rejects an empty string, and blank input
        // yields no <body>; in both cases there is simply nothing to convert.
        if (trim((string) $html) !== '') {
            $doc = new DOMDocument;
            // Collect libxml parse errors internally instead of raising PHP
            // warnings for every piece of sloppy markup, then restore the
            // caller's setting.
            $previous = libxml_use_internal_errors(true);
            // NOTE(review): libxml falls back to ISO-8859-1 when the markup
            // declares no charset - confirm callers pass suitable input.
            $doc->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
            // item(0) is null when the document has no <body>;
            // process_children() ignores a non-object node.
            $this->body = $doc->getElementsByTagName('body')->item(0);
        }

        $this->lines = array();
        $this->line = (object) array('text' => '', 'wrap' => true, 'prefix' => "\n");
        $this->newlines = 0;
        $this->pre = 0;
        $this->indent = array();
        $this->baseurl = $baseurl;
        $this->links = array();
        $this->linkcount = 0;
    }

    /**
     * Walks the parsed document and returns the plain-text rendering,
     * followed by the list of referenced URLs (one "[n] url" per line).
     *
     * @return string
     */
    public function text() {
        $this->process_children($this->body);

        if (!empty($this->links)) {
            $this->para();
            foreach ($this->links as $link => $i) {
                // URLs must never be broken across lines, so wrapping is off.
                $this->output("[$i] $link", false);
                $this->newline();
            }
        }

        // Flush the line that is still being built.
        if ($this->line) {
            $this->wrap_line();
        }

        return join("\n", $this->lines);
    }

    /**
     * Returns the attributes of a DOM node as a name => value array.
     *
     * @param DOMNode $node
     * @return array
     */
    private function get_attributes($node) {
        $attrs = array();
        if ($node->hasAttributes()) {
            foreach ($node->attributes as $attr) {
                $attrs[$attr->name] = $attr->value;
            }
        }
        return $attrs;
    }

    /**
     * Returns $url unchanged when it is absolute (starts with a URI scheme
     * such as "http:" or "mailto:", or is protocol-relative "//host/..."),
     * otherwise returns it with the base URL prepended.
     *
     * @param string $url
     * @return string
     */
    private function resolve_url($url) {
        if (preg_match('#^(?:[a-z][a-z0-9+.\-]*:|//)#i', $url)) {
            return $url;
        }
        return $this->baseurl . $url;
    }

    /**
     * Appends the current line to the finished lines, word-wrapping it at
     * 75 columns (continuation lines start with the line's prefix) unless
     * wrapping was disabled for it.
     */
    private function wrap_line() {
        if ($this->line->wrap) {
            $this->lines[] = wordwrap($this->line->text, 75, $this->line->prefix);
        }
        else {
            $this->lines[] = $this->line->text;
        }
    }

    /**
     * Requests a line break before the next output, unless a line or
     * paragraph break is already pending.
     */
    private function newline() {
        if ($this->newlines == 0) {
            $this->newlines = 1;
        }
    }

    /**
     * Requests a paragraph break (blank line) before the next output.
     */
    private function para() {
        $this->newlines = 2;
    }

    /**
     * Appends text to the current line, first starting a new (indented)
     * line if a line or paragraph break is pending.
     *
     * @param string $str  Text to append
     * @param bool   $wrap False to exclude the whole current line from word-wrapping
     */
    private function output($str, $wrap=true) {
        if ($this->newlines) {
            $this->wrap_line();
            $this->line = (object) array('text' => '', 'wrap' => $wrap, 'prefix' => "\n");

            $totalindents = count($this->indent);
            if ($totalindents) {
                $this->line->text .= ' ';
                // Outer levels always use the continuation indent.
                for ($i = 0; $i < $totalindents - 1; $i++) {
                    $this->line->text .= $this->indentchar[$this->indent[$i]];
                }
                // The innermost level gets its "first line" marker (e.g. the
                // list bullet), while wrapped continuation lines get the
                // plain indent via the prefix.
                $innermost = $this->indent[$totalindents - 1];
                $this->line->prefix .= $this->line->text . $this->indentchar[$innermost];
                $this->line->text .= $this->indentfirstchar[$innermost];
            }

            if ($this->newlines == 2) {
                // A paragraph break is a blank line before the new line.
                $this->line->text = "\n" . $this->line->text;
            }
            $this->newlines = 0;
        }

        $this->line->text .= $str;
        // Once any piece of the line disables wrapping, it stays disabled.
        $this->line->wrap = $this->line->wrap && $wrap;
    }

    /**
     * Processes every child of the given node in document order.
     *
     * @param DOMNode|null $node
     */
    private function process_children($node) {
        if (is_object($node) && $node->childNodes->length) {
            foreach ($node->childNodes as $child) {
                $this->process_node($child);
            }
        }
    }

    /**
     * Renders a single text or element node (and, recursively, its
     * children) into the output buffer. Other node types are ignored.
     *
     * @param DOMNode $node
     */
    private function process_node($node) {
        if ($node->nodeType === XML_TEXT_NODE) {
            // Whitespace-only text directly after a break is dropped. Compare
            // against '' rather than using empty(), which would also discard
            // the text "0".
            if ($this->newlines && ltrim($node->nodeValue) === '') {
                return;
            }
            $this->output($node->nodeValue);
        }
        else if ($node->nodeType === XML_ELEMENT_NODE) {
            // Elements that are handled without looking at their children.
            switch ($node->tagName) {
            case 'script':
            case 'style':
            case 'head':
                return;

            case 'hr':
                $this->para();
                $this->output('----------------------------------------------------------');
                $this->para();
                return;

            case 'br':
                $this->newline();
                return;

            case 'img':
                $attrs = $this->get_attributes($node);
                if (!empty($attrs['src'])) {
                    $href = $this->resolve_url($attrs['src']);
                    $alt = isset($attrs['alt']) ? $attrs['alt'] : '';
                    if (!isset($this->links[$href])) {
                        $this->links[$href] = ++$this->linkcount;
                    }
                    $this->output('![' . $alt . '][' . $this->links[$href] . ']');
                }
                return;
            }

            // Everything below only produces output through its children.
            if (!$node->childNodes->length) {
                return;
            }

            switch ($node->tagName) {

            case 'h1': case 'h2': case 'h3': case 'h4': case 'h5': case 'h6':
                // The heading level is the digit in the tag name.
                $n = (int) substr($node->tagName, 1, 1);
                $this->para();
                $this->output(str_repeat('#', $n) . ' ');
                $this->process_children($node);
                $this->para();
                break;

            case 'p': case 'div':
                $this->para();
                $this->process_children($node);
                $this->para();
                break;

            case 'blockquote':
                $this->para();
                $this->indent[] = 'bq';
                $this->process_children($node);
                array_pop($this->indent);
                $this->para();
                break;

            case 'em': case 'i': case 'u':
                $this->output('_');
                $this->process_children($node);
                $this->output('_');
                break;

            case 'strong': case 'b':
                $this->output('**');
                $this->process_children($node);
                $this->output('**');
                break;

            case 'dl':
                $this->para();
                $this->process_children($node);
                $this->para();
                break;

            case 'dt':
                $this->para();
                $this->process_children($node);
                $this->newline();
                break;

            case 'dd':
                $this->newline();
                $this->indent[] = 'dd';
                $this->process_children($node);
                $this->newline();
                array_pop($this->indent);
                break;

            case 'ol': case 'ul':
                $this->para();
                $this->indent[] = 'list';
                $this->process_children($node);
                $this->para();
                array_pop($this->indent);
                break;

            case 'li':
                $this->newline();
                $this->process_children($node);
                $this->newline();
                break;

            case 'table': case 'tr':
                $this->para();
                $this->process_children($node);
                $this->para();
                break;

            case 'td': case 'th':
                $this->newline();
                $this->indent[] = 'td';
                $this->process_children($node);
                $this->newline();
                array_pop($this->indent);
                break;

            case 'pre':
                $this->para();
                $this->process_children($node);
                break;

            case 'a':
                $attrs = $this->get_attributes($node);
                // Anchors such as <a name="..."> have no href at all.
                $href = isset($attrs['href']) ? $attrs['href'] : '';
                // In-page anchors and javascript: pseudo-links are rendered
                // as plain text without a reference number.
                if (!empty($href) && substr($href, 0, 1) != '#' && strncasecmp($href, 'javascript:', 11) != 0) {
                    $href = $this->resolve_url($href);
                    if (!isset($this->links[$href])) {
                        $this->links[$href] = ++$this->linkcount;
                    }
                    $this->output('[');
                }
                else {
                    $href = null;
                }
                $this->process_children($node);
                if (!empty($href)) {
                    $this->output('][' . $this->links[$href] . ']');
                }
                break;

            default:
                $this->process_children($node);
            }
        }
    }
}

?>