Files
domokits-NEW/domokits/local/modules/TheliaBlocks/Service/HtmlParserService.php

448 lines
14 KiB
PHP

<?php
/*
* This file is part of the Thelia package.
* http://www.thelia.net
*
* (c) OpenStudio <info@thelia.net>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/
namespace TheliaBlocks\Service;
use OpenApi\Model\Api\ModelFactory;
use Symfony\Component\HttpFoundation\RequestStack;
use TheliaLibrary\Model\LibraryImage;
use TheliaLibrary\TheliaLibrary;
class HtmlParserService
{
/** @var ModelFactory */
protected $modelFactory;
protected $locale;
public function __construct(ModelFactory $modelFactory, RequestStack $requestStack)
{
$this->modelFactory = $modelFactory;
$this->locale = $requestStack->getCurrentRequest()->getSession()->getLang(true)->getLocale();
}
public function htmlToJsonBlocks($html, $mediaBaseUrl = null)
{
$domDocument = new \DOMDocument();
libxml_use_internal_errors(true);
$domDocument->loadHTML($html);
$tags = $this->htmlToTags($domDocument->documentElement)['children'][0]['children'];
$blocks = array_map(function ($tag) use ($mediaBaseUrl) {return $this->tagToBlocks($tag, null, $mediaBaseUrl); }, $tags);
$blocks = array_reduce(
$blocks,
[$this, 'reduceBlocks'], []);
return json_encode($blocks);
}
protected function reduceBlocks($carry, $block)
{
$currentBlockType = $block['type']['id'] ?? null;
if ($currentBlockType === 'blockGroup') {
$block['data'] = array_reduce($block['data'], [$this, 'reduceBlocks'], []);
if (\count($block['data']) > 1) {
$carry[] = $block;
return $carry;
}
$block = $block['data'][0];
}
$previousBlockType = $carry[array_key_last($carry)]['type']['id'] ?? null;
// If current and previous are block text merge it
if ('blockText' === $previousBlockType && 'blockText' === $currentBlockType) {
$carry[array_key_last($carry)]['data']['value'] = $carry[array_key_last($carry)]['data']['value'].$block['data']['value'];
return $carry;
}
// If current is a "space" separator and previous is block text add a br to previous
if ('blockText' === $previousBlockType && 'blockSeparator' === $currentBlockType && $block['data']['type'] === 'space') {
$carry[array_key_last($carry)]['data']['value'] = $carry[array_key_last($carry)]['data']['value'].'<br/>';
return $carry;
}
// If current is a block text and previous is a "space" separator, replace previous by current and add a br at start of it
if ('blockSeparator' === $previousBlockType && $carry[array_key_last($carry)]['data']['type'] === 'space' && 'blockText' === $currentBlockType) {
$block['data']['value'] = '<br/>'.$block['data']['value'];
$carry[array_key_last($carry)] = $block;
return $carry;
}
// If current and previous are "space" separator increase size of previous
if ('blockSeparator' === $previousBlockType && 'blockSeparator' === $currentBlockType) {
$carry[array_key_last($carry)]['data']['size'] = $carry[array_key_last($carry)]['data']['size']++;
return $carry;
}
$carry[] = $block;
return $carry;
}
protected function htmlToTags($element)
{
if (!property_exists($element, 'tagName')) {
return null;
}
$obj = ['type' => $element->tagName];
if (!\in_array($element->tagName, ['html', 'body'])) {
$obj['raw'] = $element->ownerDocument->saveXML($element);
}
foreach ($element->attributes as $attribute) {
$obj[$attribute->name] = $attribute->value;
}
foreach ($element->childNodes as $subElement) {
if ($subElement->nodeType == \XML_TEXT_NODE) {
if ('' != trim($subElement->wholeText)) {
$obj['children'][] = $subElement->wholeText;
}
} else {
$child = $this->htmlToTags($subElement);
if (null !== $child) {
$obj['children'][] = $child;
}
}
}
return $obj;
}
protected function tagToBlocks($tag, $parentId = null, $mediaBaseUrl = null)
{
$blockId = $this->guidv4();
$blockBaseData = [
'id' => $blockId,
'parent' => $parentId,
];
// String without tags are text
if (\is_string($tag)) {
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockText'],
'data' => [
'value' => $tag,
],
]
);
}
// Orphan tag that are not implemented
if (!\in_array($tag['type'], ['img', 'iframe', 'br', 'hr', 'embed']) && !isset($tag['children'])) {
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockRaw'],
'data' => [
'value' => $tag['raw'],
],
]
);
}
$childrenBlocks = isset($tag['children'])
? array_map(function ($tag) use ($blockId, $mediaBaseUrl) {return $this->tagToBlocks($tag, $blockId, $mediaBaseUrl); }, $tag['children'])
: [];
// Title tags
if (\in_array($tag['type'], ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])) {
$text = array_reduce(
$tag['children'],
[$this, 'mergeChildText'],
''
);
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockTitle'],
'data' => [
'level' => substr($tag['type'], 1, 1),
'text' => strip_tags($text),
],
]
);
}
if (\in_array($tag['type'], ['ol', 'ul'])) {
// Get li as raw text
$values = array_filter(array_map(
function ($child) {
return $child['data']['value'] ?? null;
},
$childrenBlocks
), 'strlen');
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockList'],
'data' => [
'type' => $tag['type'],
'values' => $values,
],
]
);
}
if ($tag['type'] === 'li') {
$text = array_reduce(
$tag['children'],
[$this, 'mergeChildText'],
''
);
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockText'],
'data' => [
'value' => $text,
],
]
);
}
if ($tag['type'] === 'figure') {
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockGroup'],
'data' => $childrenBlocks,
]
);
}
if ($tag['type'] === 'figcaption') {
$text = array_reduce(
$tag['children'],
[$this, 'mergeChildText'],
''
);
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockText'],
'data' => [
'value' => $text,
],
]
);
}
if ($tag['type'] === 'img') {
$alt = $tag['alt'] ?? $tag['title'] ?? '';
$imgData = [];
if (isset($tag['width'])) {
$imgData['width'] = $tag['width'];
}
if (isset($tag['height'])) {
$imgData['height'] = $tag['height'];
}
$imgDataFromUrl = null !== $mediaBaseUrl
? $this->fileGetContentsCurl($mediaBaseUrl.$tag['src'])
: null;
// If we found img by his url upload it to library
if ($imgDataFromUrl) {
$fileName = bin2hex(random_bytes(5)).'_'.basename($tag['src']);
file_put_contents(TheliaLibrary::DEFAULT_IMAGE_DIRECTORY.$fileName, $imgDataFromUrl);
$libraryImage = (new LibraryImage())
->setLocale($this->locale)
->setFileName($fileName)
->setTitle($alt ?? $fileName);
$libraryImage->save();
$openApiImage = $this->modelFactory->buildModel('LibraryImage', $libraryImage);
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockImage'],
'data' => array_merge($imgData, json_decode(json_encode($openApiImage), true)),
]
);
}
// Else set the src value
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockImage'],
'data' => array_merge(
$imgData,
[
'src' => $tag['src'],
'alt' => $alt,
]
),
]
);
}
if ($tag['type'] === 'p') {
if (isset($tag['children']) && !empty($tag['children'])) {
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockGroup'],
'data' => $childrenBlocks,
]
);
}
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockText'],
'data' => [
'value' => $tag['raw'],
],
]
);
}
if (\in_array($tag['type'], ['strong', 'span'])) {
$text = array_reduce(
$tag['children'],
[$this, 'mergeChildText'],
''
);
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockText'],
'data' => [
'value' => '<'.$tag['type'].'>'.$text.'</'.$tag['type'].'>',
],
]
);
}
if ($tag['type'] === 'a') {
$imgBlocks = array_filter($childrenBlocks, function ($block) {return $block['type']['id'] === 'blockImage'; });
// If a link has an img inside it's an image block with a link
if (isset($imgBlocks[0])) {
$linkData = ['url' => $tag['href']];
if (isset($tag['target'])) {
$linkData['target'] = $tag['target'];
}
$blockData = array_merge(
$imgBlocks[0]['data'],
[
'link' => $linkData,
]
);
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockImage'],
'data' => $blockData,
]
);
}
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockText'],
'data' => [
'value' => $tag['raw'],
],
]
);
}
if (\in_array($tag['type'], ['hr', 'br'])) {
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockSeparator'],
'data' => [
'type' => $tag['type'] === 'hr' ? 'border' : 'space',
'size' => 1,
],
]
);
}
// Fix not closed iframes (remove closed tag if exist the add it to be sure all iframes has closing tags
if ($tag['type'] === 'iframe') {
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockRaw'],
'data' => [
'value' => str_replace('</iframe>', '', $tag['raw']).'</iframe>',
],
]
);
}
return array_merge(
$blockBaseData,
[
'type' => ['id' => 'blockRaw'],
'data' => [
'value' => $tag['raw'],
],
]
);
}
protected function mergeChildText($carry, $child)
{
$childText = \is_string($child) ? $child : $child['raw'];
return $carry.$childText;
}
protected function guidv4($data = null)
{
// Generate 16 bytes (128 bits) of random data or use the data passed into the function.
$data = $data ?? random_bytes(16);
\assert(\strlen($data) == 16);
// Set version to 0100
$data[6] = \chr(\ord($data[6]) & 0x0F | 0x40);
// Set bits 6-7 to 10
$data[8] = \chr(\ord($data[8]) & 0x3F | 0x80);
// Output the 36 character UUID.
return vsprintf('%s%s-%s-%s-%s-%s%s%s', str_split(bin2hex($data), 4));
}
protected function fileGetContentsCurl($url)
{
$ch = curl_init();
curl_setopt($ch, \CURLOPT_HEADER, 0);
curl_setopt($ch, \CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, \CURLOPT_URL, $url);
$data = curl_exec($ch);
curl_close($ch);
return $data;
}
}