<?php
/**
 * Mahara: Electronic portfolio, weblog, resume builder and social networking
 * Copyright (C) 2011 Catalyst IT Ltd and others; see:
 *                         http://wiki.mahara.org/Contributors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 * @package    mahara
 * @subpackage core
 * @author     Darryl Hamilton
 * @license    http://www.gnu.org/copyleft/gpl.html GNU GPL
 * @copyright  (C) 2011 Catalyst IT Ltd http://catalyst.net.nz
 *
 */

defined('INTERNAL') || die();

/**
 * The core sitemap class which generates the sitemaps.org standard sitemap files.
 */
class Sitemap {

    /**
     * Name of the sitemap index file written into the sitemap directory.
     */
    const INDEX_FILENAME = 'sitemap_index.xml';

    /**
     * @var null|string The date (Y-m-d) to check against for limiting what goes
     *                  into the sitemap; null means "include everything"
     */
    private $date_to_check;

    /**
     * @var array Array of sitemap documents (DOMDocument) waiting to be written
     */
    private $sitemaps = array();

    /**
     * @var int Maximum size (in bytes) an individual sitemap or sitemap index file can be
     */
    private $maxfilesize = 10485760; // 10MB

    /**
     * @var int Maximum number of URLs allowed in a sitemap or sitemap index file
     */
    private $maxurlcount = 50000; // 50k

    /**
     * @var float How close to the maximums we can get before starting anew
     */
    private $gracesize = 0.90;

    /**
     * @var DOMDocument The sitemap currently getting added to
     */
    private $currentsitemap;

    /**
     * @var DOMElement The urlset for $currentsitemap
     */
    private $currenturlset;

    /**
     * @var int Running estimate (in bytes) of the serialised size of $currentsitemap
     */
    private $currentsize = 0;

    /**
     * @var string The directory in which to put sitemap files
     */
    private $directory;

    /**
     * Work out which date range to cover and remove the sitemap files that are
     * about to be regenerated.
     *
     * @param bool $forcefull Force generation of a full sitemap (non-daily)
     */
    public function __construct($forcefull = false) {
        $this->directory = get_config('dataroot') . 'sitemaps/';

        // on the first of the month, or if forced, generate the full sitemap
        if ((int) date('j') === 1 || $forcefull === true) {
            $this->date_to_check = null;
            // this pattern also matches the index file, which gets regenerated
            $remove = 'sitemap_*.xml';
        }
        else { // otherwise limit to 'yesterday'
            $this->date_to_check = date("Y-m-d", strtotime('yesterday'));
            $remove = 'sitemap_' . date('Ymd') . '_*.xml';
        }

        // remove any sitemaps we're about to replace
        // (glob() may return false on error, so normalise to an array)
        $oldsitemaps = glob($this->directory . $remove);
        if (!$oldsitemaps) {
            $oldsitemaps = array();
        }
        $oldcompressed = glob($this->directory . $remove . '.gz');
        if ($oldcompressed) {
            $oldsitemaps = array_merge($oldsitemaps, $oldcompressed);
        }
        foreach ($oldsitemaps as $sitemap) {
            if (!unlink($sitemap)) {
                log_warn(sprintf("Failed to remove sitemap: %s, please check directory and file permissions.", basename($sitemap)));
            }
        }
    }

    /**
     * Generate sitemap(s) and an index
     *
     * Collects URLs for public groups, their forum topics and publicly shared
     * portfolio views. Files are only written when at least one URL was added.
     *
     * @return bool Always true
     */
    public function generate() {

        // check that the sitemaps directory exists and create it if it doesn't
        check_dir_exists($this->directory, true);

        // this is used by PluginInteractionForum::get_active_topics
        // NOTE(review): this assigns a local variable only; if get_active_topics
        // reads the global $USER, a `global $USER;` declaration may be needed - confirm.
        $USER = new User();

        // create a new sitemap
        $this->create_sitemap();

        $this->add_public_group_urls();
        $this->add_public_view_urls();

        // add the urlset and print the xml out
        // only if the urlset has any children
        if ($this->currenturlset->hasChildNodes()) {
            $this->save_sitemap(true);
        }

        return true;
    }

    /**
     * Add a url for each public, non-deleted group that passes the date check,
     * followed by the active forum topics of every public group.
     */
    private function add_public_group_urls() {
        $publicgroups = get_records_select_array('group', 'public = 1 AND deleted = 0');
        if (empty($publicgroups)) {
            return;
        }

        foreach ($publicgroups as $group) {
            if (isset($group->mtime) && $this->check_date($group->mtime)) {
                // each group gets a url entry
                $groupurl     = $this->escape_url(get_config('wwwroot') . 'group/view.php?id=' . $group->id);
                $grouplastmod = format_date(strtotime($group->mtime), 'strftimew3cdate');

                $this->add_url($groupurl, $grouplastmod);
            }

            // topics are considered even when the group itself is outside the date range
            $this->add_forum_topic_urls($group->id);
        }
    }

    /**
     * Add a url for each active topic in the forums of the given group.
     *
     * @param   int     $groupid    The id of the group whose forums are scanned
     */
    private function add_forum_topic_urls($groupid) {
        // build a list of forums in the group
        $forums   = get_forum_list($groupid);
        $forumids = array();
        if (!empty($forums)) {
            foreach ($forums as $forum) {
                $forumids[] = $forum->id;
            }
        }

        // active topics within the specified forums (public only)
        $activetopics = PluginInteractionForum::get_active_topics(0, 0, 0, $forumids);
        if (empty($activetopics['data'])) {
            return;
        }

        foreach ($activetopics['data'] as $topic) {
            $modifiedinrange = isset($topic->mtime) && $this->check_date($topic->mtime);
            $createdinrange  = isset($topic->ctime) && $this->check_date($topic->ctime);
            if (!$modifiedinrange && !$createdinrange) {
                continue;
            }

            $forumurl = $this->escape_url(get_config('wwwroot') . 'interaction/forum/topic.php?id=' . $topic->id);

            // mtime will be set if the last post has been edited
            if (isset($topic->mtime) && strtotime($topic->mtime) !== false) {
                $forumlastmod = format_date(strtotime($topic->mtime), 'strftimew3cdate');
            } // otherwise, use the last post creation date
            else {
                $forumlastmod = format_date(strtotime($topic->ctime), 'strftimew3cdate');
            }

            $this->add_url($forumurl, $forumlastmod);
        }
    }

    /**
     * Add a url for each portfolio view shared with the public that passes the
     * date check. Group homepage views are covered by the group urls.
     */
    private function add_public_view_urls() {
        $types = array('portfolio');
        $views = View::view_search(null, null, null, null, null, 0, true, null, $types);
        if (empty($views->data)) {
            return;
        }

        foreach ($views->data as $view) {
            if (isset($view['mtime']) && $this->check_date($view['mtime'])) {
                $viewurl     = $this->escape_url(get_config('wwwroot') . 'view/view.php?id=' . $view['id']);
                $viewlastmod = format_date(strtotime($view['mtime']), 'strftimew3cdate');

                $this->add_url($viewurl, $viewlastmod);
            }
        }
    }

    /**
     * Generate the sitemap index file
     *
     * Writes every pending sitemap document to disk (gzipping it when possible),
     * then builds an index listing all sitemap files found in the directory.
     *
     * Assumption - there is currently no checking done on the size of the sitemap index
     * file, as we're not expecting that limit to be reached for some time.
     *
     * @throws SystemException If a sitemap file or the index file cannot be saved
     */
    public function generate_index() {
        // main index file
        $doc = new DOMDocument('1.0', 'utf-8');

        // root node
        $sitemapindex = $doc->createElement('sitemapindex');
        $sitemapindex->setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');

        // step through each sitemap we have generated
        foreach ($this->sitemaps as $key => $sitemap) {
            $filename = sprintf("%ssitemap_%s_%d.xml", $this->directory, date("Ymd"), $key);

            if ($sitemap->save($filename) === false) {
                throw new SystemException(sprintf("Saving of this sitemap file failed: %s", $filename));
            }

            $this->compress_file($filename);
        }

        // get a list of sitemaps in the sitemap directory
        $sitemaps = glob($this->directory . 'sitemap_*.xml*');
        if (!$sitemaps) {
            $sitemaps = array();
        }

        foreach ($sitemaps as $sitemap) {
            // the glob pattern also matches a previously written index file,
            // and an index must not list itself
            if (basename($sitemap) === self::INDEX_FILENAME) {
                continue;
            }

            // create a <sitemap> node for each one we're adding
            $sitemapelement = $doc->createElement('sitemap');

            // create and encode the url
            $sitemapurl = sprintf("%sdownload.php?type=sitemap&name=%s", get_config('wwwroot'), basename($sitemap));
            $sitemapurl = $this->escape_url($sitemapurl);

            // add it to the <sitemap> node
            $loc = $doc->createElement('loc', $sitemapurl);
            $sitemapelement->appendChild($loc);

            // formatted date, uses the files modified date
            $sitemaplastmod = format_date(filemtime($sitemap), 'strftimew3cdate');

            // add it to the <sitemap> node
            $lastmod = $doc->createElement('lastmod', $sitemaplastmod);
            $sitemapelement->appendChild($lastmod);

            // add this <sitemap> node to the parent index
            $sitemapindex->appendChild($sitemapelement);
        }

        // add the index to the main doc and save it
        $doc->appendChild($sitemapindex);
        $indexfilename = $this->directory . self::INDEX_FILENAME;
        if ($doc->save($indexfilename) === false) {
            throw new SystemException(sprintf("Saving of this sitemap file failed: %s", $indexfilename));
        }
    }

    /**
     * Try to gzip a saved sitemap file using the configured gzip binary.
     * Failure is logged and otherwise ignored.
     *
     * @param   string  $filename   Full path of the xml file to compress
     */
    private function compress_file($filename) {
        if (!is_executable(get_config('pathtogzip'))) {
            log_info('Skipping compression of xml file - gzip command not found, or not executable.');
            return;
        }

        $command   = sprintf('%s %s', get_config('pathtogzip'), escapeshellarg($filename));
        $output    = array();
        $returnvar = 0;
        exec($command, $output, $returnvar);
        if ($returnvar != 0) {
            log_warn('gzip command failed.');
        }
    }

    /**
     * Escape a url for use as the text of an XML element.
     *
     * htmlspecialchars() in UTF-8 mode already yields valid UTF-8 (or an empty
     * string for invalid input), so no further re-encoding is applied.
     *
     * @param   string  $url    The raw url
     * @return  string          The url with XML special characters escaped
     */
    private function escape_url($url) {
        return htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    }

    /**
     * @param   string  $date   The date to compare with the check date
     * @return  bool Returns true if the specified date is within the bounds of the check date
     */
    private function check_date($date) {
        // null is used for the monthly 'grab everything' sitemap
        if (is_null($this->date_to_check)) {
            return true;
        }

        $time      = strtotime($date);
        $starttime = strtotime($this->date_to_check);
        if ($time === false || $starttime === false) {
            return false;
        }

        $endtime = strtotime(sprintf("%s 23:59:59", $this->date_to_check));
        return ($time >= $starttime && $time <= $endtime);
    }

    /**
     * Create a new sitemap and urlset and assign to class variables
     */
    private function create_sitemap() {
        $this->currentsitemap = new DOMDocument('1.0', 'utf-8');

        $this->currenturlset = $this->currentsitemap->createElement('urlset');
        $this->currenturlset->setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');

        // the urlset is not attached to the document until save_sitemap(), so
        // measure the XML declaration and the (empty) urlset element separately
        $this->currentsize = strlen($this->currentsitemap->saveXML())
            + strlen($this->currentsitemap->saveXML($this->currenturlset));
    }

    /**
     * Add the urlset to the current sitemap and save it into the array
     *
     * @param   bool    $final      If this is the last sitemap, generate an index file
     */
    private function save_sitemap($final = false) {
        $this->currentsitemap->appendChild($this->currenturlset);
        $this->sitemaps[] = $this->currentsitemap;

        if ($final) {
            // save to file(s), generate index
            $this->generate_index();
        }
    }

    /**
     * Add a url to a urlset
     *
     * @param   string  $loc        The url, already escaped for XML
     * @param   string  $lastmod    The last modification time
     */
    private function add_url($loc, $lastmod) {
        // if the number of urls is within the grace size of the maximum url count
        // or the size of the sitemap is within the grace size of the maximum file size
        // save the current sitemap and start a new one.
        $urlcount = $this->currenturlset->childNodes->length;
        if ($urlcount >= ($this->maxurlcount * $this->gracesize) || $this->currentsize >= ($this->maxfilesize * $this->gracesize)) {
            $this->save_sitemap();
            $this->create_sitemap();

            // log a note that filesize or url count limits were reached
            log_info('New sitemap created due to filesize or url count limits');
        }

        $url = $this->create_and_append('url', '');
        $this->create_and_append('loc', $loc, $url);
        $this->create_and_append('lastmod', $lastmod, $url);

        // the file size limit is in bytes, so count bytes (strlen) rather than
        // characters, and only serialise the new entry, not the whole document
        $this->currentsize += strlen($this->currentsitemap->saveXML($url));
    }

    /**
     * @param   string       $name       The name of the element to add
     * @param   mixed        $value      The value of the element
     * @param   mixed        $element    The element to append to (defaults to the current urlset)
     * @return  DOMElement               The newly created element
     */
    private function create_and_append($name, $value, $element = null) {
        $e = $this->currentsitemap->createElement($name, $value);
        if (is_null($element)) {
            $element = $this->currenturlset;
        }
        $element->appendChild($e);

        return $e;
    }
}