Initial commit

This commit is contained in:
2020-10-07 10:37:15 +02:00
commit ce5f440392
28157 changed files with 4429172 additions and 0 deletions

746
vendor/soundasleep/html2text/composer.lock generated vendored Normal file
View File

@@ -0,0 +1,746 @@
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at http://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "bb90182bd8d808a06ae4762518746ed3",
"packages": [],
"packages-dev": [
{
"name": "justinrainbow/json-schema",
"version": "1.3.7",
"source": {
"type": "git",
"url": "https://github.com/justinrainbow/json-schema.git",
"reference": "87b54b460febed69726c781ab67462084e97a105"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/justinrainbow/json-schema/zipball/87b54b460febed69726c781ab67462084e97a105",
"reference": "87b54b460febed69726c781ab67462084e97a105",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"json-schema/json-schema-test-suite": "1.1.0",
"phpdocumentor/phpdocumentor": "~2",
"phpunit/phpunit": "~3.7"
},
"bin": [
"bin/validate-json"
],
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.4.x-dev"
}
},
"autoload": {
"psr-0": {
"JsonSchema": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Bruno Prieto Reis",
"email": "bruno.p.reis@gmail.com"
},
{
"name": "Justin Rainbow",
"email": "justin.rainbow@gmail.com"
},
{
"name": "Igor Wiedler",
"email": "igor@wiedler.ch"
},
{
"name": "Robert Schönthal",
"email": "seroscho@googlemail.com"
}
],
"description": "A library to validate a json schema.",
"homepage": "https://github.com/justinrainbow/json-schema",
"keywords": [
"json",
"schema"
],
"time": "2014-08-25 02:48:14"
},
{
"name": "phpunit/php-code-coverage",
"version": "2.0.15",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-code-coverage.git",
"reference": "34cc484af1ca149188d0d9e91412191e398e0b67"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-code-coverage/zipball/34cc484af1ca149188d0d9e91412191e398e0b67",
"reference": "34cc484af1ca149188d0d9e91412191e398e0b67",
"shasum": ""
},
"require": {
"php": ">=5.3.3",
"phpunit/php-file-iterator": "~1.3",
"phpunit/php-text-template": "~1.2",
"phpunit/php-token-stream": "~1.3",
"sebastian/environment": "~1.0",
"sebastian/version": "~1.0"
},
"require-dev": {
"ext-xdebug": ">=2.1.4",
"phpunit/phpunit": "~4"
},
"suggest": {
"ext-dom": "*",
"ext-xdebug": ">=2.2.1",
"ext-xmlwriter": "*"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.0.x-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sb@sebastian-bergmann.de",
"role": "lead"
}
],
"description": "Library that provides collection, processing, and rendering functionality for PHP code coverage information.",
"homepage": "https://github.com/sebastianbergmann/php-code-coverage",
"keywords": [
"coverage",
"testing",
"xunit"
],
"time": "2015-01-24 10:06:35"
},
{
"name": "phpunit/php-file-iterator",
"version": "1.3.4",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-file-iterator.git",
"reference": "acd690379117b042d1c8af1fafd61bde001bf6bb"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-file-iterator/zipball/acd690379117b042d1c8af1fafd61bde001bf6bb",
"reference": "acd690379117b042d1c8af1fafd61bde001bf6bb",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"type": "library",
"autoload": {
"classmap": [
"File/"
]
},
"notification-url": "https://packagist.org/downloads/",
"include-path": [
""
],
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sb@sebastian-bergmann.de",
"role": "lead"
}
],
"description": "FilterIterator implementation that filters files based on a list of suffixes.",
"homepage": "https://github.com/sebastianbergmann/php-file-iterator/",
"keywords": [
"filesystem",
"iterator"
],
"time": "2013-10-10 15:34:57"
},
{
"name": "phpunit/php-text-template",
"version": "1.2.0",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-text-template.git",
"reference": "206dfefc0ffe9cebf65c413e3d0e809c82fbf00a"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-text-template/zipball/206dfefc0ffe9cebf65c413e3d0e809c82fbf00a",
"reference": "206dfefc0ffe9cebf65c413e3d0e809c82fbf00a",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"type": "library",
"autoload": {
"classmap": [
"Text/"
]
},
"notification-url": "https://packagist.org/downloads/",
"include-path": [
""
],
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sb@sebastian-bergmann.de",
"role": "lead"
}
],
"description": "Simple template engine.",
"homepage": "https://github.com/sebastianbergmann/php-text-template/",
"keywords": [
"template"
],
"time": "2014-01-30 17:20:04"
},
{
"name": "phpunit/php-timer",
"version": "1.0.5",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-timer.git",
"reference": "19689d4354b295ee3d8c54b4f42c3efb69cbc17c"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-timer/zipball/19689d4354b295ee3d8c54b4f42c3efb69cbc17c",
"reference": "19689d4354b295ee3d8c54b4f42c3efb69cbc17c",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"type": "library",
"autoload": {
"classmap": [
"PHP/"
]
},
"notification-url": "https://packagist.org/downloads/",
"include-path": [
""
],
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sb@sebastian-bergmann.de",
"role": "lead"
}
],
"description": "Utility class for timing",
"homepage": "https://github.com/sebastianbergmann/php-timer/",
"keywords": [
"timer"
],
"time": "2013-08-02 07:42:54"
},
{
"name": "phpunit/php-token-stream",
"version": "1.4.0",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/php-token-stream.git",
"reference": "db32c18eba00b121c145575fcbcd4d4d24e6db74"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/php-token-stream/zipball/db32c18eba00b121c145575fcbcd4d4d24e6db74",
"reference": "db32c18eba00b121c145575fcbcd4d4d24e6db74",
"shasum": ""
},
"require": {
"ext-tokenizer": "*",
"php": ">=5.3.3"
},
"require-dev": {
"phpunit/phpunit": "~4.2"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.4-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de"
}
],
"description": "Wrapper around PHP's tokenizer extension.",
"homepage": "https://github.com/sebastianbergmann/php-token-stream/",
"keywords": [
"tokenizer"
],
"time": "2015-01-17 09:51:32"
},
{
"name": "phpunit/phpunit",
"version": "4.0.0",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/phpunit.git",
"reference": "b3a7c58bc39f01577f89d63da1ec578e1e993f1a"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit/zipball/b3a7c58bc39f01577f89d63da1ec578e1e993f1a",
"reference": "b3a7c58bc39f01577f89d63da1ec578e1e993f1a",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-pcre": "*",
"ext-reflection": "*",
"ext-spl": "*",
"php": ">=5.3.3",
"phpunit/php-code-coverage": ">=2.0.0,<2.1.0",
"phpunit/php-file-iterator": "~1.3.1",
"phpunit/php-text-template": "~1.2",
"phpunit/php-timer": "~1.0.2",
"phpunit/phpunit-mock-objects": ">=2.0.0,<2.1.0",
"sebastian/diff": "~1.1",
"sebastian/environment": "~1.0",
"sebastian/exporter": "~1.0.1",
"sebastian/version": "~1.0",
"symfony/yaml": "~2.0"
},
"suggest": {
"ext-json": "*",
"ext-simplexml": "*",
"ext-tokenizer": "*",
"phpunit/php-invoker": "~1.1"
},
"bin": [
"phpunit"
],
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "4.0.x-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"include-path": [
"",
"../../symfony/yaml/"
],
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de",
"role": "lead"
}
],
"description": "The PHP Unit Testing framework.",
"homepage": "http://www.phpunit.de/",
"keywords": [
"phpunit",
"testing",
"xunit"
],
"time": "2014-03-07 07:00:44"
},
{
"name": "phpunit/phpunit-mock-objects",
"version": "2.0.10",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/phpunit-mock-objects.git",
"reference": "e60bb929c50ae4237aaf680a4f6773f4ee17f0a2"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/phpunit-mock-objects/zipball/e60bb929c50ae4237aaf680a4f6773f4ee17f0a2",
"reference": "e60bb929c50ae4237aaf680a4f6773f4ee17f0a2",
"shasum": ""
},
"require": {
"php": ">=5.3.3",
"phpunit/php-text-template": "~1.2"
},
"require-dev": {
"phpunit/phpunit": ">=4.0.0,<4.1.0"
},
"suggest": {
"ext-soap": "*"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.0.x-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"include-path": [
""
],
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sb@sebastian-bergmann.de",
"role": "lead"
}
],
"description": "Mock Object library for PHPUnit",
"homepage": "https://github.com/sebastianbergmann/phpunit-mock-objects/",
"keywords": [
"mock",
"xunit"
],
"time": "2014-06-12 07:19:48"
},
{
"name": "sebastian/diff",
"version": "1.2.0",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/diff.git",
"reference": "5843509fed39dee4b356a306401e9dd1a931fec7"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/diff/zipball/5843509fed39dee4b356a306401e9dd1a931fec7",
"reference": "5843509fed39dee4b356a306401e9dd1a931fec7",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"require-dev": {
"phpunit/phpunit": "~4.2"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.2-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Kore Nordmann",
"email": "mail@kore-nordmann.de"
},
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de"
}
],
"description": "Diff implementation",
"homepage": "http://www.github.com/sebastianbergmann/diff",
"keywords": [
"diff"
],
"time": "2014-08-15 10:29:00"
},
{
"name": "sebastian/environment",
"version": "1.2.1",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/environment.git",
"reference": "6e6c71d918088c251b181ba8b3088af4ac336dd7"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/environment/zipball/6e6c71d918088c251b181ba8b3088af4ac336dd7",
"reference": "6e6c71d918088c251b181ba8b3088af4ac336dd7",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"require-dev": {
"phpunit/phpunit": "~4.3"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.2.x-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de"
}
],
"description": "Provides functionality to handle HHVM/PHP environments",
"homepage": "http://www.github.com/sebastianbergmann/environment",
"keywords": [
"Xdebug",
"environment",
"hhvm"
],
"time": "2014-10-25 08:00:45"
},
{
"name": "sebastian/exporter",
"version": "1.0.2",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/exporter.git",
"reference": "c7d59948d6e82818e1bdff7cadb6c34710eb7dc0"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/exporter/zipball/c7d59948d6e82818e1bdff7cadb6c34710eb7dc0",
"reference": "c7d59948d6e82818e1bdff7cadb6c34710eb7dc0",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"require-dev": {
"phpunit/phpunit": "~4.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Jeff Welch",
"email": "whatthejeff@gmail.com"
},
{
"name": "Volker Dusch",
"email": "github@wallbash.com"
},
{
"name": "Bernhard Schussek",
"email": "bschussek@2bepublished.at"
},
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de"
},
{
"name": "Adam Harvey",
"email": "aharvey@php.net"
}
],
"description": "Provides the functionality to export PHP variables for visualization",
"homepage": "http://www.github.com/sebastianbergmann/exporter",
"keywords": [
"export",
"exporter"
],
"time": "2014-09-10 00:51:36"
},
{
"name": "sebastian/version",
"version": "1.0.4",
"source": {
"type": "git",
"url": "https://github.com/sebastianbergmann/version.git",
"reference": "a77d9123f8e809db3fbdea15038c27a95da4058b"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/sebastianbergmann/version/zipball/a77d9123f8e809db3fbdea15038c27a95da4058b",
"reference": "a77d9123f8e809db3fbdea15038c27a95da4058b",
"shasum": ""
},
"type": "library",
"autoload": {
"classmap": [
"src/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Sebastian Bergmann",
"email": "sebastian@phpunit.de",
"role": "lead"
}
],
"description": "Library that helps with managing the version number of Git-hosted PHP projects",
"homepage": "https://github.com/sebastianbergmann/version",
"time": "2014-12-15 14:25:24"
},
{
"name": "soundasleep/component-tests",
"version": "dev-master",
"source": {
"type": "git",
"url": "https://github.com/soundasleep/component-tests.git",
"reference": "c1e9e6608ef41452b35d68a7057ea7add77ccd36"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/soundasleep/component-tests/zipball/c1e9e6608ef41452b35d68a7057ea7add77ccd36",
"reference": "c1e9e6608ef41452b35d68a7057ea7add77ccd36",
"shasum": ""
},
"require": {
"justinrainbow/json-schema": "~1.3",
"phpunit/phpunit": "4.0"
},
"type": "library",
"autoload": {
"psr-4": {
"ComponentTests\\": "src"
}
},
"notification-url": "https://packagist.org/downloads/",
"description": "Common Composer and PHP component lint and validation tests",
"time": "2015-03-18 02:30:24"
},
{
"name": "symfony/yaml",
"version": "v2.6.5",
"target-dir": "Symfony/Component/Yaml",
"source": {
"type": "git",
"url": "https://github.com/symfony/Yaml.git",
"reference": "0cd8e72071e46e15fc072270ae39ea1b66b10a9d"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/Yaml/zipball/0cd8e72071e46e15fc072270ae39ea1b66b10a9d",
"reference": "0cd8e72071e46e15fc072270ae39ea1b66b10a9d",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"require-dev": {
"symfony/phpunit-bridge": "~2.7"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.6-dev"
}
},
"autoload": {
"psr-0": {
"Symfony\\Component\\Yaml\\": ""
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Symfony Community",
"homepage": "http://symfony.com/contributors"
},
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
}
],
"description": "Symfony Yaml Component",
"homepage": "http://symfony.com",
"time": "2015-03-12 10:28:44"
}
],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": {
"soundasleep/component-tests": 20
},
"prefer-stable": false,
"prefer-lowest": false,
"platform": {
"php": ">=5.3.2",
"ext-dom": "*",
"ext-libxml": "*"
},
"platform-dev": []
}

View File

@@ -0,0 +1,38 @@
<?php
/******************************************************************************
* Copyright (c) 2010 Jevon Wright and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* or
*
* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
*
*
* Contributors:
* Jevon Wright - initial API and implementation
****************************************************************************/
/**
* This file allows you to convert through the command line.
* Usage:
* php -f convert.php [input file]
*/
if (count($argv) < 2) {
throw new \InvalidArgumentException("Expected: php -f convert.php [input file]");
}
if (!file_exists($argv[1])) {
throw new \InvalidArgumentException("'" . $argv[1] . "' does not exist");
}
$input = file_get_contents($argv[1]);
require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");
echo Html2Text\Html2Text::convert($input);

View File

@@ -0,0 +1,32 @@
<?php
/******************************************************************************
* Copyright (c) 2010 Jevon Wright and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* or
*
* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
*
*
* Contributors:
* Jevon Wright - initial API and implementation
****************************************************************************/
/**
* This file is available if you still want to use functions rather than
* autoloading classes.
*/
require_once(__DIR__ . "/src/Html2Text.php");
require_once(__DIR__ . "/src/Html2TextException.php");
function convert_html_to_text($html) {
return Html2Text\Html2Text::convert($html);
}
function fix_newlines($text) {
return Html2Text\Html2Text::fixNewlines($text);
}

View File

@@ -0,0 +1,394 @@
<?php
/******************************************************************************
* Copyright (c) 2010 Jevon Wright and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* or
*
* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
*
*
* Contributors:
* Jevon Wright - initial API and implementation
****************************************************************************/
namespace Html2Text;
class Html2Text {
/**
* Tries to convert the given HTML into a plain text format - best suited for
* e-mail display, etc.
*
* <p>In particular, it tries to maintain the following features:
* <ul>
* <li>Links are maintained, with the 'href' copied over
* <li>Information in the &lt;head&gt; is lost
* </ul>
*
* @param string $html the input HTML
* @param boolean $ignore_error Ignore xml parsing errors
* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link DOMDocument}
*/
public static function convert($html, $ignore_error = false) {
// replace &nbsp; with spaces
$html = str_replace("&nbsp;", " ", $html);
$html = str_replace("\xc2\xa0", " ", $html);
$is_office_document = static::isOfficeDocument($html);
if ($is_office_document) {
// remove office namespace
$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
}
$html = static::fixNewlines($html);
if (mb_detect_encoding($html, "UTF-8", true)) {
$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}
$doc = static::getDocument($html, $ignore_error);
$output = static::iterateOverNode($doc, null, false, $is_office_document);
// remove leading and trailing spaces on each line
$output = preg_replace("/[ \t]*\n[ \t]*/im", "\n", $output);
$output = preg_replace("/ *\t */im", "\t", $output);
// unarmor pre blocks
$output = str_replace("\r", "\n", $output);
// remove unnecessary empty lines
$output = preg_replace("/\n\n\n*/im", "\n\n", $output);
// remove leading and trailing whitespace
$output = trim($output);
return $output;
}
/**
* Unify newlines; in particular, \r\n becomes \n, and
* then \r becomes \n. This means that all newlines (Unix, Windows, Mac)
* all become \ns.
*
* @param string $text text with any number of \r, \r\n and \n combinations
* @return string the fixed text
*/
static function fixNewlines($text) {
// replace \r\n to \n
$text = str_replace("\r\n", "\n", $text);
// remove \rs
$text = str_replace("\r", "\n", $text);
return $text;
}
/**
* Parse HTML into a DOMDocument
*
* @param string $html the input HTML
* @param boolean $ignore_error Ignore xml parsing errors
* @return DOMDocument the parsed document tree
*/
static function getDocument($html, $ignore_error = false) {
$doc = new \DOMDocument();
$html = trim($html);
if (!$html) {
// DOMDocument doesn't support empty value and throws an error
// Return empty document instead
return $doc;
}
if ($html[0] !== '<') {
// If HTML does not begin with a tag, we put a body tag around it.
// If we do not do this, PHP will insert a paragraph tag around
// the first block of text for some reason which can mess up
// the newlines. See pre.html test for an example.
$html = '<body>' . $html . '</body>';
}
if ($ignore_error) {
$doc->strictErrorChecking = false;
$doc->recover = true;
$doc->xmlStandalone = true;
$old_internal_errors = libxml_use_internal_errors(true);
$load_result = $doc->loadHTML($html, LIBXML_NOWARNING | LIBXML_NOERROR | LIBXML_NONET);
libxml_use_internal_errors($old_internal_errors);
}
else {
$load_result = $doc->loadHTML($html);
}
if (!$load_result) {
throw new Html2TextException("Could not load HTML - badly formed?", $html);
}
return $doc;
}
/**
* Can we guess that this HTML is generated by Microsoft Office?
*/
static function isOfficeDocument($html) {
return strpos($html, "urn:schemas-microsoft-com:office") !== false;
}
static function isWhitespace($text) {
return strlen(trim($text, "\n\r\t ")) === 0;
}
static function nextChildName($node) {
// get the next child
$nextNode = $node->nextSibling;
while ($nextNode != null) {
if ($nextNode instanceof \DOMText) {
if (!static::isWhitespace($nextNode->wholeText)) {
break;
}
}
if ($nextNode instanceof \DOMElement) {
break;
}
$nextNode = $nextNode->nextSibling;
}
$nextName = null;
if (($nextNode instanceof \DOMElement || $nextNode instanceof \DOMText) && $nextNode != null) {
$nextName = strtolower($nextNode->nodeName);
}
return $nextName;
}
static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false) {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivilant to \s)
if ($in_pre) {
$text = "\n" . trim($node->wholeText, "\n\r\t ") . "\n";
// Remove trailing whitespace only
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
// armor newlines with \r.
return str_replace("\n", "\r", $text);
} else {
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
return "\n" . $text;
}
return $text;
}
}
if ($node instanceof \DOMDocumentType) {
// ignore
return "";
}
if ($node instanceof \DOMProcessingInstruction) {
// ignore
return "";
}
$name = strtolower($node->nodeName);
$nextName = static::nextChildName($node);
// start whitespace
switch ($name) {
case "hr":
$prefix = '';
if ($prevName != null) {
$prefix = "\n";
}
return $prefix . "---------------------------------------------------------------\n";
case "style":
case "head":
case "title":
case "meta":
case "script":
// ignore these tags
return "";
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
case "ol":
case "ul":
// add two newlines, second line is added below
$output = "\n";
break;
case "td":
case "th":
// add tab char to separate table fields
$output = "\t";
break;
case "p":
// Microsoft exchange emails often include HTML which, when passed through
// html2text, results in lots of double line returns everywhere.
//
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
break;
}
// add two lines
$output = "\n\n";
break;
case "pre":
case "tr":
case "div":
// add one line
$output = "\n";
break;
case "li":
$output = "- ";
break;
default:
// print out contents of unknown tags
$output = "";
break;
}
// debug
//$output .= "[$name,$nextName]";
if (isset($node->childNodes)) {
$n = $node->childNodes->item(0);
$previousSiblingName = null;
while($n != null) {
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document);
// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType
|| $n instanceof \DOMProcessingInstruction
|| ($n instanceof \DOMText && static::isWhitespace($text))) {
// Keep current previousSiblingName, these are invisible
}
else {
$previousSiblingName = strtolower($n->nodeName);
}
$node->removeChild($n);
$n = $node->childNodes->item(0);
// suppress last br tag inside a node list
if ($n != null || $previousSiblingName != 'br') {
$output .= $text;
}
}
}
// end whitespace
switch ($name) {
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
$output .= "\n";
break;
case "p":
// add two lines
$output .= "\n\n";
break;
case "pre":
case "br":
// add one line
$output .= "\n";
break;
case "div":
break;
case "a":
// links are returned in [text](link) format
$href = $node->getAttribute("href");
$output = trim($output);
// remove double [[ ]] s from linking images
if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
$output = substr($output, 1, strlen($output) - 2);
// for linking images, the title of the <a> overrides the title of the <img>
if ($node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
}
// if there is no link text, but a title attr
if (!$output && $node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
if ($href == null) {
// it doesn't link anywhere
if ($node->getAttribute("name") != null) {
$output = "[$output]";
}
} else {
if ($href == $output || $href == "mailto:$output" || $href == "http://$output" || $href == "https://$output") {
// link to the same address: just use link
$output;
} else {
// replace it
if ($output) {
$output = "[$output]($href)";
} else {
// empty string
$output = $href;
}
}
}
// does the next node require additional whitespace?
switch ($nextName) {
case "h1": case "h2": case "h3": case "h4": case "h5": case "h6":
$output .= "\n";
break;
}
break;
case "img":
if ($node->getAttribute("title")) {
$output = "[" . $node->getAttribute("title") . "]";
} elseif ($node->getAttribute("alt")) {
$output = "[" . $node->getAttribute("alt") . "]";
} else {
$output = "";
}
break;
case "li":
$output .= "\n";
break;
default:
// do nothing
}
return $output;
}
}

View File

@@ -0,0 +1,28 @@
<?php
/******************************************************************************
* Copyright (c) 2010 Jevon Wright and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* or
*
* LGPL which is available at http://www.gnu.org/licenses/lgpl.html
*
*
* Contributors:
* Jevon Wright - initial API and implementation
****************************************************************************/
namespace Html2Text;
class Html2TextException extends \Exception {
var $more_info;
public function __construct($message = "", $more_info = "") {
parent::__construct($message);
$this->more_info = $more_info;
}
}