Commit af0e933f authored by Darryl Hamilton
Browse files

Feature: Generate a sitemap (bug #793815)

Add code to generate a sitemap using the sitemaps.org
standard/protocol. By default a daily sitemap is generated that only
covers the previous day's changes, with a full generation done on the
first of the month. The full generation can also be forced by passing
'true' to the constructor.

Sitemap file(s) are placed in a 'sitemaps' directory just off
the docroot, with an index file placed in the docroot itself. If
the 'sitemaps' directory does not exist, one will be created.

More information can be found on
https://wiki.mahara.org/index.php/Developer_Area/Specifications_in_Development/Sitemaps



Change-Id: Ib28983ca1c3b5ff06a6e603f402460531b987067
Signed-off-by: Darryl Hamilton <darrylh@catalyst.net.nz>
parent b43f8c57
......@@ -457,6 +457,14 @@ $siteoptionform = array(
'help' => true,
'disabled' => in_array('allowpublicprofiles', $OVERRIDDEN),
),
'generatesitemap' => array(
'type' => 'checkbox',
'title' => get_string('generatesitemap', 'admin'),
'description' => get_string('generatesitemapdescription', 'admin'),
'defaultvalue' => get_config('generatesitemap'),
'help' => true,
'disabled' => in_array('generatesitemap', $OVERRIDDEN) || !get_config('allowpublicviews'),
),
'showselfsearchsideblock' => array(
'type' => 'checkbox',
'title' => get_string('showselfsearchsideblock', 'admin'),
......@@ -543,7 +551,7 @@ function siteoptions_submit(Pieform $form, $values) {
$fields = array(
'sitename','lang','theme', 'pathtoclam',
'defaultaccountlifetime', 'defaultaccountinactiveexpire', 'defaultaccountinactivewarn',
'allowpublicviews', 'allowpublicprofiles',
'allowpublicviews', 'allowpublicprofiles', 'generatesitemap',
'registration_sendweeklyupdates', 'institutionexpirynotification', 'institutionautosuspend',
'showselfsearchsideblock', 'searchusernames', 'showtagssideblock',
'tagssideblockmaxtags', 'country', 'viewmicroheaders', 'userscanchooseviewthemes',
......@@ -552,6 +560,12 @@ function siteoptions_submit(Pieform $form, $values) {
'noreplyaddress', 'homepageinfo', 'showonlineuserssideblock', 'registerterms', 'allowmobileuploads',
'creategroups', 'createpublicgroups', 'allowgroupcategories', 'wysiwyg',
);
// if public views are disabled, sitemap generation must also be disabled.
if ($values['allowpublicviews'] == false) {
$values['generatesitemap'] = false;
}
$oldlanguage = get_config('lang');
$oldtheme = get_config('theme');
foreach ($fields as $field) {
......
......@@ -232,6 +232,8 @@ $string['defaultaccountlifetimedescription'] = 'If set, user accounts will expir
$string['embeddedcontent'] = 'Embedded content';
$string['embeddedcontentdescription'] = 'If you would like users to be able to embed videos or other outside content into their portfolios, you can choose which sites to trust below.';
$string['Everyone'] = 'Everyone';
// Title and description shown for the 'generatesitemap' checkbox on the site options form.
$string['generatesitemap'] = 'Generate Sitemap';
$string['generatesitemapdescription'] = 'Generate sitemap files from publicly accessible views, groups, and forum topics';
$string['homepageinfo'] = 'Show home page information';
$string['homepageinfodescription'] = 'If enabled, information about Mahara and how it is used will be displayed on the Mahara home page. Logged in users will have the option to disable it.';
$string['institutionautosuspend'] = 'Auto-suspend expired institutions';
......
......@@ -190,6 +190,7 @@ $cfg->imagemaxwidth = 1024;
$cfg->imagemaxheight = 1024;
// Paths and arguments for various system commands.
// pathtogzip is read by Sitemap::generate_index() to compress each generated
// sitemap file; when it is not executable the sitemaps are left uncompressed.
$cfg->pathtogzip = '/bin/gzip';
$cfg->pathtounzip = '/usr/bin/unzip';
$cfg->pathtozip = '/usr/bin/zip';
$cfg->ziprecursearg = '-r';
......
......@@ -2358,6 +2358,16 @@ function xmldb_core_upgrade($oldversion=0) {
$field = new XMLDBField('retainview');
$field->setAttributes(XMLDB_TYPE_INTEGER, 1, null, XMLDB_NOTNULL, null, null, null, 0);
add_field($table, $field);
// Install a cron job to generate the sitemap
$cron = new StdClass;
$cron->callfunction = 'cron_sitemap_daily';
$cron->minute = '0';
$cron->hour = '1';
$cron->day = '*';
$cron->month = '*';
$cron->dayofweek = '*';
insert_record('cron', $cron);
}
return $status;
......
......@@ -2606,6 +2606,21 @@ function cron_site_data_daily() {
graph_site_data_daily();
}
/**
 * Cron callback that builds the sitemap files.
 *
 * Pulls in the libraries and the forum interaction plugin that sitemap
 * generation relies on, then runs a Sitemap created with its default
 * (non-forced) settings.
 */
function cron_sitemap_daily() {
    $libroot = get_config('libroot');
    foreach (array('searchlib.php', 'group.php', 'view.php', 'sitemap.php') as $library) {
        require_once($libroot . $library);
    }
    safe_require('interaction', 'forum');

    $generator = new Sitemap();
    $generator->generate();
}
function build_portfolio_search_html(&$data) {
global $THEME;
$artefacttypes = get_records_assoc('artefact_installed_type');
......
<?php
/**
* Mahara: Electronic portfolio, weblog, resume builder and social networking
* Copyright (C) 2011 Catalyst IT Ltd and others; see:
* http://wiki.mahara.org/Contributors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* @package mahara
* @subpackage core
* @author Darryl Hamilton
* @license http://www.gnu.org/copyleft/gpl.html GNU GPL
* @copyright (C) 2011 Catalyst IT Ltd http://catalyst.net.nz
*
*/
defined('INTERNAL') || die();
/**
 * The core sitemap class which generates the sitemaps.org standard sitemap files.
 *
 * URLs for public groups, the active topics of their forums and publicly shared
 * portfolio views are collected into one or more <urlset> documents. The documents
 * are written to the 'sitemaps' directory under the docroot (gzipped when possible)
 * and listed in a sitemap index file placed in the docroot itself.
 *
 * Unless a full generation is requested (or it is the first of the month), only
 * items whose timestamps fall on the previous day are included.
 */
class Sitemap {

    /**
     * @var null|string The date (Y-m-d) to check against for limiting what goes into
     *                  the sitemap, or null when everything should be included
     */
    private $date_to_check;

    /**
     * @var array Array of completed sitemap documents (DOMDocument), in creation order
     */
    private $sitemaps = array();

    /**
     * @var int Maximum size, in bytes, an individual sitemap or sitemap index file can be
     */
    private $maxfilesize = 10485760; // 10MB

    /**
     * @var int Maximum number of URLs allowed in a sitemap or sitemap index file
     */
    private $maxurlcount = 50000; // 50k

    /**
     * @var float How close to the maximums we can get before starting anew
     */
    private $gracesize = 0.90;

    /**
     * @var DOMDocument The sitemap currently getting added to
     */
    private $currentsitemap;

    /**
     * @var DOMElement The urlset for $currentsitemap
     */
    private $currenturlset;

    /**
     * @var int Running estimate, in bytes, of the serialised size of $currentsitemap
     */
    private $currentsize = 0;

    /**
     * @var int Number of <url> entries added to $currenturlset so far
     */
    private $currenturlcount = 0;

    /**
     * @param bool $forcefull Force generation of a full sitemap (non-daily)
     */
    public function __construct($forcefull = false) {
        // on the first of the month, or if forced, generate the full sitemap
        if ((int) date('j') === 1 || $forcefull === true) {
            $this->date_to_check = null;
        }
        // otherwise limit to 'yesterday'
        else {
            $this->date_to_check = date('Y-m-d', strtotime('yesterday'));
        }
    }

    /**
     * Generate sitemap(s) and an index
     *
     * Nothing is written when no URL qualifies for inclusion.
     *
     * @return bool False if sitemap generation is disabled in the site
     *              configuration, true otherwise
     * @throws SystemException If a sitemap file or the index file cannot be saved
     */
    public function generate() {
        if (!get_config('generatesitemap')) {
            log_info('Sitemap generation has been disabled.');
            return false;
        }

        // check that the sitemaps directory exists and create it if it doesn't
        check_dir_exists(get_config('docroot') . 'sitemaps', true);

        // this is used by PluginInteractionForum::get_active_topics
        // NOTE(review): this assigns a local variable only; unless constructing a
        // User has side effects, get_active_topics() never sees it. Confirm whether
        // 'global $USER' was intended before changing it.
        $USER = new User();

        // create a new sitemap
        $this->create_sitemap();

        $this->add_public_groups();
        $this->add_public_views();

        // save the last sitemap and write everything out,
        // but only if the urlset has any children
        if ($this->currenturlset->hasChildNodes()) {
            $this->save_sitemap(true);
        }

        return true;
    }

    /**
     * Generate the sitemap index file
     *
     * Each collected sitemap is saved as sitemaps/sitemap_N.xml below the docroot,
     * gzipped when the configured gzip binary is executable, and referenced from
     * sitemap_index.xml in the docroot.
     *
     * Assumption - there is currently no checking done on the size of the sitemap index
     * file, as we're not expecting that limit to be reached for some time.
     *
     * @throws SystemException If a sitemap file or the index file cannot be saved
     */
    public function generate_index() {
        // main index file
        $doc = new DOMDocument('1.0', 'utf-8');

        // root node
        $sitemapindex = $doc->createElement('sitemapindex');
        $sitemapindex->setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');

        $pathtogzip = get_config('pathtogzip');

        // step through each sitemap we have generated
        foreach ($this->sitemaps as $key => $sitemap) {
            $filename = sprintf("%ssitemaps/sitemap_%d.xml", get_config('docroot'), $key);

            if ($sitemap->save($filename) === false) {
                throw new SystemException(sprintf("Saving of this sitemap file failed: %s", $filename));
            }

            $gzip = '';
            // try to gzip the xml file
            if (is_executable($pathtogzip)) {
                // -f: a .gz file left behind by an earlier run must be overwritten,
                // otherwise gzip refuses to run and the stale archive stays in place
                $command = sprintf('%s -f %s', escapeshellarg($pathtogzip), escapeshellarg($filename));
                $output = array();
                $returnvar = null;
                exec($command, $output, $returnvar);
                if ($returnvar === 0) {
                    $gzip = '.gz';
                }
                else {
                    log_warn('gzip command failed.');
                }
            }
            else {
                log_info('Skipping compression of xml file - gzip command not found, or not executable.');
            }

            // create a <sitemap> node for each one we're adding
            $sitemapelement = $doc->createElement('sitemap');

            // create and encode the url
            $sitemapurl = sprintf("%ssitemaps/sitemap_%d.xml%s", get_config('wwwroot'), $key, $gzip);
            $loc = $doc->createElement('loc', $this->encode_url($sitemapurl));
            $sitemapelement->appendChild($loc);

            // formatted date, assumption that today is when they're all created
            $sitemaplastmod = format_date(time(), 'strftimew3cdate');
            $lastmod = $doc->createElement('lastmod', $sitemaplastmod);
            $sitemapelement->appendChild($lastmod);

            // add this <sitemap> node to the parent index
            $sitemapindex->appendChild($sitemapelement);
        }

        // add the index to the main doc and save it
        $doc->appendChild($sitemapindex);
        $indexfilename = sprintf("%ssitemap_index.xml", get_config('docroot'));
        if ($doc->save($indexfilename) === false) {
            throw new SystemException(sprintf("Saving of the sitemap index file failed: %s", $indexfilename));
        }
    }

    /**
     * Add a url for every public group that passes the date check, followed by
     * the active forum topics of each public group.
     */
    private function add_public_groups() {
        $publicgroups = get_records_select_array('group', 'public = 1 AND deleted = 0');
        if (empty($publicgroups)) {
            return;
        }

        foreach ($publicgroups as $group) {
            if (isset($group->mtime) && $this->check_date($group->mtime)) {
                // each group gets a url entry
                $groupurl = get_config('wwwroot') . 'group/view.php?id=' . $group->id;
                $grouplastmod = format_date(strtotime($group->mtime), 'strftimew3cdate');
                $this->add_url($this->encode_url($groupurl), $grouplastmod);
            }

            // topics are considered even when the group itself was not added
            $this->add_group_topics($group->id);
        }
    }

    /**
     * Add a url for each active topic in the forums of one group.
     *
     * @param int $groupid The id of the group whose forums are inspected
     */
    private function add_group_topics($groupid) {
        // build a list of forums in the group
        $forumids = array();
        $forums = get_forum_list($groupid);
        if (!empty($forums)) {
            foreach ($forums as $forum) {
                $forumids[] = $forum->id;
            }
        }

        // active topics within the specified forums (public only)
        $activetopics = PluginInteractionForum::get_active_topics(0, 0, 0, $forumids);
        if (empty($activetopics['data'])) {
            return;
        }

        foreach ($activetopics['data'] as $topic) {
            $modifiedinrange = isset($topic->mtime) && $this->check_date($topic->mtime);
            $createdinrange = isset($topic->ctime) && $this->check_date($topic->ctime);
            if (!$modifiedinrange && !$createdinrange) {
                continue;
            }

            $forumurl = get_config('wwwroot') . 'interaction/forum/topic.php?id=' . $topic->id;

            // mtime will be set if the last post has been edited
            if (isset($topic->mtime) && strtotime($topic->mtime) !== false) {
                $forumlastmod = format_date(strtotime($topic->mtime), 'strftimew3cdate');
            }
            // otherwise, use the last post creation date
            else {
                $forumlastmod = format_date(strtotime($topic->ctime), 'strftimew3cdate');
            }

            $this->add_url($this->encode_url($forumurl), $forumlastmod);
        }
    }

    /**
     * Add a url for each 'portfolio' view returned by View::view_search() that
     * passes the date check. grouphomepage type views are covered by the group urls.
     */
    private function add_public_views() {
        $types = array('portfolio');
        $views = View::view_search(null, null, null, null, null, 0, true, null, $types);
        if (empty($views->data)) {
            return;
        }

        foreach ($views->data as $view) {
            if (isset($view['mtime']) && $this->check_date($view['mtime'])) {
                $viewurl = get_config('wwwroot') . 'view/view.php?id=' . $view['id'];
                $viewlastmod = format_date(strtotime($view['mtime']), 'strftimew3cdate');
                $this->add_url($this->encode_url($viewurl), $viewlastmod);
            }
        }
    }

    /**
     * Entity-escape a url so it can be handed to DOMDocument::createElement(),
     * which expects its value argument to be escaped already.
     *
     * The url is treated as UTF-8 and is deliberately not passed through
     * utf8_encode(): that function assumes ISO-8859-1 input and would
     * double-encode any non-ASCII characters.
     *
     * @param string $url The raw url
     * @return string The escaped url
     */
    private function encode_url($url) {
        return htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
    }

    /**
     * @param string $date The date to compare with the check date
     * @return bool Returns true if the specified date is within the bounds of the check date
     */
    private function check_date($date) {
        // null is used for the monthly 'grab everything' sitemap
        if (is_null($this->date_to_check)) {
            return true;
        }

        $time = strtotime($date);
        $starttime = strtotime($this->date_to_check);
        if ($time === false || $starttime === false) {
            return false;
        }

        $endtime = strtotime(sprintf("%s 23:59:59", $this->date_to_check));
        return ($time >= $starttime && $time <= $endtime);
    }

    /**
     * Create a new sitemap and urlset and assign to class variables
     *
     * The urlset is only attached to the document in save_sitemap(), so the size
     * and url counters are tracked separately and reset here.
     */
    private function create_sitemap() {
        $this->currentsitemap = new DOMDocument('1.0', 'utf-8');
        $this->currenturlset = $this->currentsitemap->createElement('urlset');
        $this->currenturlset->setAttribute('xmlns', 'http://www.sitemaps.org/schemas/sitemap/0.9');

        // start from the fixed overhead: xml declaration plus the urlset wrapper
        $this->currentsize = strlen($this->currentsitemap->saveXML())
            + strlen($this->currentsitemap->saveXML($this->currenturlset))
            + strlen('</urlset>');
        $this->currenturlcount = 0;
    }

    /**
     * Add the urlset to the current sitemap and save it into the array
     *
     * @param bool $final If this is the last sitemap, generate an index file
     */
    private function save_sitemap($final = false) {
        $this->currentsitemap->appendChild($this->currenturlset);
        $this->sitemaps[] = $this->currentsitemap;

        if ($final) {
            // save to file(s), generate index
            $this->generate_index();
        }
    }

    /**
     * Add a url to a urlset
     *
     * If the number of urls is within the grace size of the maximum url count,
     * or the size of the sitemap is within the grace size of the maximum file size,
     * the current sitemap is put aside and a new one is started first.
     *
     * @param string $loc The url, already entity-escaped
     * @param string $lastmod The last modification time
     */
    private function add_url($loc, $lastmod) {
        $urllimit = $this->maxurlcount * $this->gracesize;
        $sizelimit = $this->maxfilesize * $this->gracesize;

        if ($this->currenturlcount >= $urllimit || $this->currentsize >= $sizelimit) {
            $this->save_sitemap();
            $this->create_sitemap();
            // log a note that filesize or url count limits were reached
            log_info('New sitemap created due to filesize or url count limits');
        }

        $url = $this->create_and_append('url', '');
        $this->create_and_append('loc', $loc, $url);
        $this->create_and_append('lastmod', $lastmod, $url);

        // measure only the new entry, in bytes, rather than re-serialising
        // the whole sitemap for every url
        $this->currentsize += strlen($this->currentsitemap->saveXML($url));
        $this->currenturlcount++;
    }

    /**
     * @param string $name The name of the element to add
     * @param mixed $value The value of the element
     * @param mixed $element The element to append to, defaults to the current urlset
     * @return DOMElement The newly created element
     */
    private function create_and_append($name, $value, $element = null) {
        $e = $this->currentsitemap->createElement($name, $value);
        if (is_null($element)) {
            $element = $this->currenturlset;
        }
        $element->appendChild($e);
        return $e;
    }
}
......@@ -709,6 +709,7 @@ function core_install_firstcoredata_defaults() {
set_config('createpublicgroups', 'all');
set_config('allowpublicviews', 1);
set_config('allowpublicprofiles', 1);
set_config('generatesitemap', 1);
set_config('showselfsearchsideblock', 0);
set_config('showtagssideblock', 1);
set_config('tagssideblockmaxtags', 20);
......@@ -823,6 +824,7 @@ function core_install_firstcoredata_defaults() {
'cron_site_data_daily' => array('51', '23', '*', '*', '*'),
'cron_check_for_updates' => array(rand(0, 59), rand(0, 23), '*', '*', '*'),
'cron_clean_internal_activity_notifications'=> array(45, 22, '*', '*', '*'),
'cron_sitemap_daily' => array(0, 1, '*', '*', '*'),
);
foreach ($cronjobs as $callfunction => $times) {
$cron = new StdClass;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment