Commit ece284bb authored by Richard Mansfield's avatar Richard Mansfield
Browse files

Use HTMLPurifier in clean_text function

parent 81d40d0e
* This is a stub include that automatically configures the include path.
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
require_once 'HTMLPurifier.php';
* Function wrapper for HTML Purifier for quick use.
* @note This function only includes the library when it is called. While
* this is efficient for instances when you only use HTML Purifier
* on a few of your pages, it murders bytecode caching. You still
* need to add HTML Purifier to your path.
* @note ''HTMLPurifier()'' is NOT the same as ''new HTMLPurifier()''
function HTMLPurifier($html, $config = null) {
static $purifier = false;
if (!$purifier) {
require_once 'HTMLPurifier.php';
$purifier = new HTMLPurifier();
return $purifier->purify($html, $config);
/*! @mainpage
* HTML Purifier is an HTML filter that will take an arbitrary snippet of
* HTML and rigorously test, validate and filter it into a version that
* is safe for output onto webpages. It achieves this by:
* -# Lexing (parsing into tokens) the document,
* -# Executing various strategies on the tokens:
* -# Removing all elements not in the whitelist,
* -# Making the tokens well-formed,
* -# Fixing the nesting of the nodes, and
* -# Validating attributes of the nodes; and
* -# Generating HTML from the purified tokens.
* However, most users will only need to interface with the HTMLPurifier
* class, so this massive amount of infrastructure is usually concealed.
* If you plan on working with the internals, be sure to include
* HTMLPurifier_ConfigSchema and HTMLPurifier_Config.
HTML Purifier 3.0.0 - Standards Compliant HTML Filtering
Copyright (C) 2006-2008 Edward Z. Yang
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
// constants are slow, but we'll make one exception
define('HTMLPURIFIER_PREFIX', dirname(__FILE__));
// every class has an undocumented dependency to these, must be included!
require_once 'HTMLPurifier/ConfigSchema.php'; // fatal errors if not included
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Context.php';
require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Strategy/Core.php';
require_once 'HTMLPurifier/Encoder.php';
require_once 'HTMLPurifier/ErrorCollector.php';
require_once 'HTMLPurifier/LanguageFactory.php';
'Core', 'CollectErrors', false, 'bool', '
Whether or not to collect errors found while filtering the document. This
is a useful way to give feedback to your users. <strong>Warning:</strong>
Currently this feature is very patchy and experimental, with lots of
possible error messages not yet implemented. It will not cause any problems,
but it may not help your users either. This directive has been available
since 2.0.0.
* Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
* @note There are several points in which configuration can be specified
* for HTML Purifier. The precedence of these (from lowest to
* highest) is as follows:
* -# Instance: new HTMLPurifier($config)
* -# Invocation: purify($html, $config)
* These configurations are entirely independent of each other and
* are *not* merged.
* @todo We need an easier way to inject strategies, it'll probably end
* up getting done through config though.
class HTMLPurifier
public $version = '3.0.0';
public $config;
public $filters = array();
protected $strategy, $generator;
* Resultant HTMLPurifier_Context of last run purification. Is an array
* of contexts if the last called method was purifyArray().
* @public
public $context;
* Initializes the purifier.
* @param $config Optional HTMLPurifier_Config object for all instances of
* the purifier, if omitted, a default configuration is
* supplied (which can be overridden on a per-use basis).
* The parameter can also be any type that
* HTMLPurifier_Config::create() supports.
public function __construct($config = null) {
$this->config = HTMLPurifier_Config::create($config);
$this->strategy = new HTMLPurifier_Strategy_Core();
$this->generator = new HTMLPurifier_Generator();
* Adds a filter to process the output. First come first serve
* @param $filter HTMLPurifier_Filter object
public function addFilter($filter) {
$this->filters[] = $filter;
* Filters an HTML snippet/document to be XSS-free and standards-compliant.
* @param $html String of HTML to purify
* @param $config HTMLPurifier_Config object for this operation, if omitted,
* defaults to the config object specified during this
* object's construction. The parameter can also be any type
* that HTMLPurifier_Config::create() supports.
* @return Purified HTML
public function purify($html, $config = null) {
// todo: make the config merge in, instead of replace
$config = $config ? HTMLPurifier_Config::create($config) : $this->config;
// implementation is partially environment dependant, partially
// configuration dependant
$lexer = HTMLPurifier_Lexer::create($config);
$context = new HTMLPurifier_Context();
// our friendly neighborhood generator, all primed with configuration too!
$this->generator->generateFromTokens(array(), $config, $context);
$context->register('Generator', $this->generator);
// set up global context variables
if ($config->get('Core', 'CollectErrors')) {
// may get moved out if other facilities use it
$language_factory = HTMLPurifier_LanguageFactory::instance();
$language = $language_factory->create($config, $context);
$context->register('Locale', $language);
$error_collector = new HTMLPurifier_ErrorCollector($context);
$context->register('ErrorCollector', $error_collector);
// setup id_accumulator context, necessary due to the fact that
// AttrValidator can be called from many places
$id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
$context->register('IDAccumulator', $id_accumulator);
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
$html = $this->filters[$i]->preFilter($html, $config, $context);
// purified HTML
$html =
// list of tokens
// list of un-purified tokens
// un-purified HTML
$html, $config, $context
$config, $context
$config, $context
for ($i = $size - 1; $i >= 0; $i--) {
$html = $this->filters[$i]->postFilter($html, $config, $context);
$html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
$this->context =& $context;
return $html;
* Filters an array of HTML snippets
* @param $config Optional HTMLPurifier_Config object for this operation.
* See HTMLPurifier::purify() for more details.
* @return Array of purified HTML
public function purifyArray($array_of_html, $config = null) {
$context_array = array();
foreach ($array_of_html as $key => $html) {
$array_of_html[$key] = $this->purify($html, $config);
$context_array[$key] = $this->context;
$this->context = $context_array;
return $array_of_html;
* Singleton for enforcing just one HTML Purifier in your system
* @param $prototype Optional prototype HTMLPurifier instance to
* overload singleton with.
public static function &getInstance($prototype = null) {
static $htmlpurifier;
if (!$htmlpurifier || $prototype) {
if ($prototype instanceof HTMLPurifier) {
$htmlpurifier = $prototype;
} elseif ($prototype) {
$htmlpurifier = new HTMLPurifier($prototype);
} else {
$htmlpurifier = new HTMLPurifier();
return $htmlpurifier;
require_once 'HTMLPurifier/AttrTypes.php';
* Defines common attribute collections that modules reference
class HTMLPurifier_AttrCollections
* Associative array of attribute collections, indexed by name
public $info = array();
* Performs all expansions on internal data for use by other inclusions
* It also collects all attribute collection extensions from
* modules
* @param $attr_types HTMLPurifier_AttrTypes instance
* @param $modules Hash array of HTMLPurifier_HTMLModule members
public function __construct($attr_types, $modules) {
// load extensions from the modules
foreach ($modules as $module) {
foreach ($module->attr_collections as $coll_i => $coll) {
if (!isset($this->info[$coll_i])) {
$this->info[$coll_i] = array();
foreach ($coll as $attr_i => $attr) {
if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
// merge in includes
$this->info[$coll_i][$attr_i] = array_merge(
$this->info[$coll_i][$attr_i], $attr);
$this->info[$coll_i][$attr_i] = $attr;
// perform internal expansions and inclusions
foreach ($this->info as $name => $attr) {
// merge attribute collections that include others
// replace string identifiers with actual attribute objects
$this->expandIdentifiers($this->info[$name], $attr_types);
* Takes a reference to an attribute associative array and performs
* all inclusions specified by the zero index.
* @param &$attr Reference to attribute array
public function performInclusions(&$attr) {
if (!isset($attr[0])) return;
$merge = $attr[0];
$seen = array(); // recursion guard
// loop through all the inclusions
for ($i = 0; isset($merge[$i]); $i++) {
if (isset($seen[$merge[$i]])) continue;
$seen[$merge[$i]] = true;
// foreach attribute of the inclusion, copy it over
if (!isset($this->info[$merge[$i]])) continue;
foreach ($this->info[$merge[$i]] as $key => $value) {
if (isset($attr[$key])) continue; // also catches more inclusions
$attr[$key] = $value;
if (isset($this->info[$merge[$i]][0])) {
// recursion
$merge = array_merge($merge, $this->info[$merge[$i]][0]);
* Expands all string identifiers in an attribute array by replacing
* them with the appropriate values inside HTMLPurifier_AttrTypes
* @param &$attr Reference to attribute array
* @param $attr_types HTMLPurifier_AttrTypes instance
public function expandIdentifiers(&$attr, $attr_types) {
// because foreach will process new elements we add, make sure we
// skip duplicates
$processed = array();
foreach ($attr as $def_i => $def) {
// skip inclusions
if ($def_i === 0) continue;
if (isset($processed[$def_i])) continue;
// determine whether or not attribute is required
if ($required = (strpos($def_i, '*') !== false)) {
// rename the definition
$def_i = trim($def_i, '*');
$attr[$def_i] = $def;
$processed[$def_i] = true;
// if we've already got a literal object, move on
if (is_object($def)) {
// preserve previous required
$attr[$def_i]->required = ($required || $attr[$def_i]->required);
if ($def === false) {
if ($t = $attr_types->get($def)) {
$attr[$def_i] = $t;
$attr[$def_i]->required = $required;
} else {
* Base class for all validating attribute definitions.
* This family of classes forms the core for not only HTML attribute validation,
* but also any sort of string that needs to be validated or cleaned (which
* means CSS properties and composite definitions are defined here too).
* Besides defining (through code) what precisely makes the string valid,
* subclasses are also responsible for cleaning the code if possible.
abstract class HTMLPurifier_AttrDef
* Tells us whether or not an HTML attribute is minimized. Has no
* meaning in other contexts.
public $minimized = false;
* Tells us whether or not an HTML attribute is required. Has no
* meaning in other contexts
public $required = false;
* Validates and cleans passed string according to a definition.
* @param $string String to be validated and cleaned.
* @param $config Mandatory HTMLPurifier_Config object.
* @param $context Mandatory HTMLPurifier_AttrContext object.
abstract public function validate($string, $config, $context);
* Convenience method that parses a string as if it were CDATA.
* This method process a string in the manner specified at
* <> by removing
* leading and trailing whitespace, ignoring line feeds, and replacing
* carriage returns and tabs with spaces. While most useful for HTML
* attributes specified as CDATA, it can also be applied to most CSS
* values.
* @note This method is not entirely standards compliant, as trim() removes
* more types of whitespace than specified in the spec. In practice,
* this is rarely a problem, as those extra characters usually have
* already been removed by HTMLPurifier_Encoder.
* @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized
* to "\n", so the fix is not as simple as fixing it in this
* function. Trim and whitespace collapsing are supposed to only
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
public function parseCDATA($string) {
$string = trim($string);
$string = str_replace("\n", '', $string);
$string = str_replace(array("\r", "\t"), ' ', $string);
return $string;
* Factory method for creating this class from a string.
* @param $string String construction info
* @return Created AttrDef object corresponding to $string
public function make($string) {
// default implementation, return flyweight of this object
// if overloaded, it is *necessary* for you to clone the
// object (usually by instantiating a new copy) and return that
return $this;
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/CSSDefinition.php';
* Validates the HTML attribute style, otherwise known as CSS.
* @note We don't implement the whole CSS specification, so it might be
* difficult to reuse this component in the context of validating
* actual stylesheet declarations.
* @note If we were really serious about validating the CSS, we would
* tokenize the styles and then parse the tokens. Obviously, we
* are not doing that. Doing that could seriously harm performance,
* but would make these components a lot more viable for a CSS
* filtering solution.
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
public function validate($css, $config, $context) {
$css = $this->parseCDATA($css);
$definition = $config->getCSSDefinition();
// we're going to break the spec and explode by semicolons.
// This is because semicolon rarely appears in escaped form
// Doing this is generally flaky but fast
// for details
$declarations = explode(';', $css);
$propvalues = array();
foreach ($declarations as $declaration) {
if (!$declaration) continue;
if (!strpos($declaration, ':')) continue;
list($property, $value) = explode(':', $declaration, 2);
$property = trim($property);
$value = trim($value);
$ok = false;
do {
if (isset($definition->info[$property])) {
$ok = true;
if (ctype_lower($property)) break;
$property = strtolower($property);
if (isset($definition->info[$property])) {
$ok = true;
} while(0);
if (!$ok) continue;
// inefficient call, since the validator will do this again
if (strtolower(trim($value)) !== 'inherit') {
// inherit works for everything (but only on the base property)
$result = $definition->info[$property]->validate(
$value, $config, $context );
} else {
$result = 'inherit';
if ($result === false) continue;
$propvalues[$property] = $result;
// procedure does not write the new CSS simultaneously, so it's
// slightly inefficient, but it's the only way of getting rid of
// duplicates. Perhaps config to optimize it, but not now.
$new_declarations = '';
foreach ($propvalues as $prop => $value) {
$new_declarations .= "$prop:$value;";
return $new_declarations ? $new_declarations : false;
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
public function __construct() {
parent::__construct(false); // opacity is non-negative, but we will clamp it
public function validate($number, $config, $context) {
$result = parent::validate($number, $config, $context);
if ($result === false) return $result;
$float = (float) $result;
if ($float < 0.0) $result = '0';
if ($float > 1.0) $result = '1';
return $result;
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/CSSDefinition.php';
* Validates shorthand CSS property background.
* @warning Does not support url tokens that have internal spaces.
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
* Local copy of component validators.
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
protected $info;
public function __construct($config) {
$def = $config->getCSSDefi