diff --git a/Classes/PHPExcel/Reader/HTML.php b/Classes/PHPExcel/Reader/HTML.php index a19eaec9d..0ac550f9a 100644 --- a/Classes/PHPExcel/Reader/HTML.php +++ b/Classes/PHPExcel/Reader/HTML.php @@ -12,6 +12,7 @@ * PHPExcel_Reader_HTML * * Copyright (c) 2006 - 2015 PHPExcel + * Copyright (c) 2015 Wine Logistix GmbH * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -30,11 +31,12 @@ * @category PHPExcel * @package PHPExcel_Reader * @copyright Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel) + * @copyright Copyright (c) 2015 Wine Logistix GmbH (http://www.wine-logistix.de) * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL * @version ##VERSION##, ##DATE## */ /** PHPExcel root directory */ -class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader +class PHPExcel_Reader_HTML extends PHPExcel_Reader_HTML_Abstract { /** @@ -123,39 +125,6 @@ public function __construct() $this->readFilter = new PHPExcel_Reader_DefaultReadFilter(); } - /** - * Validate that the current file is an HTML file - * - * @return boolean - */ - protected function isValidFormat() - { - // Reading 2048 bytes should be enough to validate that the format is HTML - $data = fread($this->fileHandle, 2048); - if ((strpos($data, '<') !== false) && - (strlen($data) !== strlen(strip_tags($data)))) { - return true; - } - - return false; - } - - /** - * Loads PHPExcel from file - * - * @param string $pFilename - * @return PHPExcel - * @throws PHPExcel_Reader_Exception - */ - public function load($pFilename) - { - // Create new PHPExcel - $objPHPExcel = new PHPExcel(); - - // Load into this instance - return $this->loadIntoExisting($pFilename, $objPHPExcel); - } - /** * Set input encoding * @@ -183,6 +152,12 @@ public function getInputEncoding() protected $tableLevel = 0; protected $nestedColumn = array('A'); + /** + * Active Worksheet which is used for writing to. + * @var PHPExcel_Worksheet + */ + protected $sheet; + protected function setTableStartColumn($column) { if ($this->tableLevel == 0) { @@ -206,7 +181,21 @@ protected function releaseTableStartColumn() return array_pop($this->nestedColumn); } - protected function flushCell($sheet, $column, $row, &$cellContent) + protected function loadHandler(PHPExcel $objPHPExcel) + { + // Create new PHPExcel worksheets. + while ($objPHPExcel->getSheetCount() <= $this->sheetIndex) { + $objPHPExcel->createSheet(); + } + $objPHPExcel->setActiveSheetIndex($this->sheetIndex); + $this->sheet = $objPHPExcel->getActiveSheet(); + } + + protected function finishHandler() + { + } + + protected function flushCell($column, $row, &$cellContent) { if (is_string($cellContent)) { // Simple String content @@ -215,7 +204,7 @@ protected function flushCell($sheet, $column, $row, &$cellContent) // echo 'FLUSH CELL: ' , $column , $row , ' => ' , $cellContent , '
'; // Write to worksheet to be done here... // ... we return the cell so we can mess about with styles more easily - $sheet->setCellValue($column . $row, $cellContent, true); + $this->sheet->setCellValue($column . $row, $cellContent, true); $this->dataArray[$row][$column] = $cellContent; } } else { @@ -226,28 +215,29 @@ protected function flushCell($sheet, $column, $row, &$cellContent) $cellContent = (string) ''; } - protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null) + protected function textElementHandler(DOMNode $element, &$row, &$column, &$cellContent) { - foreach ($element->childNodes as $child) { - if ($child instanceof DOMText) { - $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue)); - if (is_string($cellContent)) { - // simply append the text if the cell content is a plain text string - $cellContent .= $domText; - } else { - // but if we have a rich text run instead, we need to append it correctly - // TODO - } - } elseif ($child instanceof DOMElement) { -// echo 'DOM ELEMENT: ' , strtoupper($child->nodeName) , '
'; + $domText = preg_replace('/\s+/u', ' ', trim($element->nodeValue)); + if (is_string($cellContent)) { + // simply append the text if the cell content is a plain text string + $cellContent .= $domText; + } else { + // but if we have a rich text run instead, we need to append it correctly + // TODO + } + } + + protected function defaultElementHandler(DOMNode $element, &$row, &$column, &$cellContent, $format = null) + { +// echo 'DOM ELEMENT: ' , strtoupper($element->nodeName) , '
'; $attributeArray = array(); - foreach ($child->attributes as $attribute) { + foreach ($element->attributes as $attribute) { // echo 'ATTRIBUTE: ' , $attribute->name , ' => ' , $attribute->value , '
'; $attributeArray[$attribute->name] = $attribute->value; } - switch ($child->nodeName) { + switch ($element->nodeName) { case 'meta': foreach ($attributeArray as $attributeName => $attributeValue) { switch ($attributeName) { @@ -257,11 +247,11 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, break; } } - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); break; case 'title': - $this->processDomElement($child, $sheet, $row, $column, $cellContent); - $sheet->setTitle($cellContent); + $this->processDomElement($element, $row, $column, $cellContent); + $this->sheet->setTitle($cellContent); $cellContent = ''; break; case 'span': @@ -275,20 +265,20 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, if ($cellContent > '') { $cellContent .= ' '; } - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); if ($cellContent > '') { $cellContent .= ' '; } // echo 'END OF STYLING, SPAN OR DIV
'; break; case 'hr': - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); ++$row; - if (isset($this->formats[$child->nodeName])) { - $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]); + if (isset($this->formats[$element->nodeName])) { + $this->sheet->getStyle($column . $row)->applyFromArray($this->formats[$element->nodeName]); } else { $cellContent = '----------'; - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); } ++$row; // Add a break after a horizontal rule, simply by allowing the code to dropthru @@ -298,7 +288,7 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, $cellContent .= "\n"; } else { // Otherwise flush our existing content and move the row cursor on - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); ++$row; } // echo 'HARD LINE BREAK: ' , '
'; @@ -309,15 +299,15 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, switch ($attributeName) { case 'href': // echo 'Link to ' , $attributeValue , '
'; - $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue); - if (isset($this->formats[$child->nodeName])) { - $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]); + $this->sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue); + if (isset($this->formats[$element->nodeName])) { + $this->sheet->getStyle($column . $row)->applyFromArray($this->formats[$element->nodeName]); } break; } } $cellContent .= ' '; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF HYPERLINK:' , '
'; break; case 'h1': @@ -333,20 +323,20 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, // If we're inside a table, replace with a \n $cellContent .= "\n"; // echo 'LIST ENTRY: ' , '
'; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF LIST ENTRY:' , '
'; } else { if ($cellContent > '') { - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); $row++; } // echo 'START OF PARAGRAPH: ' , '
'; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF PARAGRAPH:' , '
'; - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); - if (isset($this->formats[$child->nodeName])) { - $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]); + if (isset($this->formats[$element->nodeName])) { + $this->sheet->getStyle($column . $row)->applyFromArray($this->formats[$element->nodeName]); } $row++; @@ -358,28 +348,28 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, // If we're inside a table, replace with a \n $cellContent .= "\n"; // echo 'LIST ENTRY: ' , '
'; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF LIST ENTRY:' , '
'; } else { if ($cellContent > '') { - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); } ++$row; // echo 'LIST ENTRY: ' , '
'; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF LIST ENTRY:' , '
'; - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); $column = 'A'; } break; case 'table': - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); $column = $this->setTableStartColumn($column); // echo 'START OF TABLE LEVEL ' , $this->tableLevel , '
'; if ($this->tableLevel > 1) { --$row; } - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF TABLE LEVEL ' , $this->tableLevel , '
'; $column = $this->releaseTableStartColumn(); if ($this->tableLevel > 1) { @@ -390,33 +380,33 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, break; case 'thead': case 'tbody': - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); break; case 'tr': $column = $this->getTableStartColumn(); $cellContent = ''; // echo 'START OF TABLE ' , $this->tableLevel , ' ROW
'; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); ++$row; // echo 'END OF TABLE ' , $this->tableLevel , ' ROW
'; break; case 'th': case 'td': // echo 'START OF TABLE ' , $this->tableLevel , ' CELL
'; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); // echo 'END OF TABLE ' , $this->tableLevel , ' CELL
'; while (isset($this->rowspan[$column . $row])) { ++$column; } - $this->flushCell($sheet, $column, $row, $cellContent); + $this->flushCell($column, $row, $cellContent); // if (isset($attributeArray['style']) && !empty($attributeArray['style'])) { // $styleAry = $this->getPhpExcelStyleArray($attributeArray['style']); // // if (!empty($styleAry)) { -// $sheet->getStyle($column . $row)->applyFromArray($styleAry); +// $this->sheet->getStyle($column . $row)->applyFromArray($styleAry); // } // } @@ -427,25 +417,25 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, ++$columnTo; } $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1); - foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) { + foreach (PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) { $this->rowspan[$value] = true; } - $sheet->mergeCells($range); + $this->sheet->mergeCells($range); $column = $columnTo; } elseif (isset($attributeArray['rowspan'])) { //create merging rowspan $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1); - foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) { + foreach (PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) { $this->rowspan[$value] = true; } - $sheet->mergeCells($range); + $this->sheet->mergeCells($range); } elseif (isset($attributeArray['colspan'])) { //create merging colspan $columnTo = $column; for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) { ++$columnTo; } - $sheet->mergeCells($column . $row . ':' . $columnTo . $row); + $this->sheet->mergeCells($column . $row . ':' . $columnTo . $row); $column = $columnTo; } ++$column; @@ -455,58 +445,12 @@ protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, $column = 'A'; $content = ''; $this->tableLevel = 0; - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); break; default: - $this->processDomElement($child, $sheet, $row, $column, $cellContent); + $this->processDomElement($element, $row, $column, $cellContent); } - } - } - } - - /** - * Loads PHPExcel from file into PHPExcel instance - * - * @param string $pFilename - * @param PHPExcel $objPHPExcel - * @return PHPExcel - * @throws PHPExcel_Reader_Exception - */ - public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel) - { - // Open file to validate - $this->openFile($pFilename); - if (!$this->isValidFormat()) { - fclose($this->fileHandle); - throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file."); - } - // Close after validating - fclose($this->fileHandle); - - // Create new PHPExcel - while ($objPHPExcel->getSheetCount() <= $this->sheetIndex) { - $objPHPExcel->createSheet(); - } - $objPHPExcel->setActiveSheetIndex($this->sheetIndex); - - // Create a new DOM object - $dom = new domDocument; - // Reload the HTML file into the DOM object - $loaded = $dom->loadHTML($this->securityScanFile($pFilename)); - if ($loaded === false) { - throw new PHPExcel_Reader_Exception('Failed to load ', $pFilename, ' as a DOM Document'); - } - - // Discard white space - $dom->preserveWhiteSpace = false; - - $row = 0; - $column = 'A'; - $content = ''; - $this->processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content); - - // Return - return $objPHPExcel; + // This method does all traversing itself, no TRAVERSE_CHILD hint needed. } /** @@ -532,18 +476,5 @@ public function setSheetIndex($pValue = 0) return $this; } - /** - * Scan theXML for use of ElementHandler where is lowercase element name. + * Explicit handlers must accept same arguments as defaultElementHandler. + * + * Other handlers exist which facilitate implementation specific behavior: + * + * flushCell - Write a cell value + * textElementHandler - Invoked for DOMText elements. + * loadHandler - Invoked before traversing the DOM. + * finishHandler - Invoked after traversing the DOM. + * + * Copyright (c) 2006 - 2015 PHPExcel + * Copyright (c) 2015 Wine Logistix GmbH + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @category PHPExcel + * @package PHPExcel_Reader_HTML + * @copyright Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel) + * @copyright Copyright (c) 2015 Wine Logistix (http://www.wine-logistix.de) + * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL + * @version ##VERSION##, ##DATE## + */ +abstract class PHPExcel_Reader_HTML_Abstract extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader +{ + + /** + * Tell processDomElement to traverse child elements of the current child + * element recursively. + * @var int + */ + const TRAVERSE_CHILDS = 1; + + /** + * Write cell content at specified position to active sheet. + * @param string $column + * @param int $row + * @param string $cellContent + */ + protected abstract function flushCell($column, $row, &$cellContent); + + /** + * Handler for elements with no explicit handler. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + * @return int|null TRAVERSE_CHILDS or null + */ + protected abstract function defaultElementHandler(DOMNode $element, &$row, &$column, &$cellContent); + + /** + * Handler for DOMText elements. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected abstract function textElementHandler(DOMNode $element, &$row, &$column, &$cellContent); + + /** + * Handler which is executed after loading the HTML file and before + * traversing elements. + * @param PHPExcel $objPHPExcel + */ + protected abstract function loadHandler(PHPExcel $objPHPExcel); + + /** + * Handler which is executed after traversing elements and before + * returning from load method. + */ + protected abstract function finishHandler(); + + /** + * Loads PHPExcel from file. + * @param string $pFilename + * @return PHPExcel + * @throws PHPExcel_Reader_Exception + */ + public function load($pFilename) + { + // Create new PHPExcel + $objPHPExcel = new PHPExcel(); + // Open file to validate + $this->openFile($pFilename); + if (!$this->isValidFileFormat()) { + fclose($this->fileHandle); + throw new PHPExcel_Reader_Exception($pFilename . " is an invalid HTML file."); + } + // Close after validating + fclose($this->fileHandle); + // Load into this instance + return $this->loadIntoExisting($pFilename, $objPHPExcel); + } + + /** + * Loads PHPExcel from string. + * @param string $content HTML content + * @return PHPExcel + * @throws PHPExcel_Reader_Exception + */ + public function loadFromString($content) + { + $objPHPExcel = new PHPExcel(); + if (!$this->isValidFormat($content)) { + throw new PHPExcel_Reader_Exception("HTML content is invalid"); + } + $html = $this->securityScan($content); + return $this->loadIntoExistingFromString($html, $objPHPExcel); + } + + /** + * Loads PHPExcel from file into PHPExcel instance. + * + * @param string $pFilename + * @param PHPExcel $objPHPExcel + * @return PHPExcel + * @throws PHPExcel_Reader_Exception + */ + public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel) + { + $html = $this->securityScanFile($pFilename); + return $this->loadIntoExistingFromString($html, $objPHPExcel); + } + + /** + * Loads PHPExcel from string into PHPExcel instance. + */ + protected function loadIntoExistingFromString($content, PHPExcel $objPHPExcel) + { + // This method is protected as it doesn't do the security scan on content. + // Create a new DOM object + $dom = new DOMDocument(); + // Reload the HTML file into the DOM object + $loaded = $dom->loadHTML($content); + if ($loaded === false) { + throw new PHPExcel_Reader_Exception('Failed to load ', $pFilename, ' as a DOM Document'); + } + + // Discard white space + $dom->preserveWhiteSpace = false; + + $row = 1; + $column = 'A'; + $content = ''; + + // Allow implementation specific initalization after load. + $this->loadHandler($objPHPExcel); + + $this->processDomElement($dom, $row, $column, $content); + + // Allow implementation specific operation after processing. + $this->finishHandler(); + + // Return + return $objPHPExcel; + } + + /** + * Validate that data contains HTML. + * @return boolean + */ + protected function isValidFormat(&$data) + { + if ((strpos($data, '<') !== false) && + (strlen($data) !== strlen(strip_tags($data)))) { + return true; + } + return false; + } + + /** + * Validate that the current file is an HTML file + * + * @return boolean + */ + protected function isValidFileFormat() + { + // Reading 2048 bytes should be enough to validate that the format is HTML + $data = fread($this->fileHandle, 2048); + return $this->isValidFormat($data); + } + + /** + * Traverse elements in DOM and invoke handler. + * A handler method in own object with name ElementHandler + * is invoked if the method exists, or defaultElementHandler if not. + * Handlers can indicate whether to traverse child elements, by returning + * TRAVERSE_CHILDS. Childs are traversed recursively. + * @param DOMNode $element Element of which childs are traversed. + * @param int $row Row number + * @param string $column Excel style column name + * @param $cellContent A buffer which can be used by implementation to store temporary cell content before flushing to cell. + */ + protected function processDomElement(DOMNode $element, &$row, &$column, &$cellContent) + { + foreach ($element->childNodes as $child) { + if ($child instanceof DOMText) { + $this->textElementHandler($child, $row, $column, $cellContent); + } elseif ($child instanceof DOMElement) { + // For each element a handler is invoked dynamically. If you + // don't want to use dynamic dispatch, use defaultElementHandler. + $nodeName = $this->cleanNodeName($child->nodeName); + $handlerName = $nodeName . "ElementHandler"; + $continueWith = (method_exists($this, $handlerName) + ? $this->{$handlerName}($child, $row, $column, $cellContent) + : $this->defaultElementHandler($child, $row, $column, $cellContent)); + if ($continueWith === self::TRAVERSE_CHILDS && $child->hasChildNodes()) { + // Handlers may traverse the DOM themselves. To avoid + // unnecessary traversing in here, by default no childs of + // the child are traversed. If however indicated by handler + // to traverse childs, then do so. + $this->processDomElement($child, $row, $column, $cellContent); + } + } + } + } + + protected function cleanNodeName($elementName) + { + return strtolower(preg_replace('/[^a-zA-Z0-9]/u', '', $elementName)); + } + + /** + * Scan theXML for use of caption - worksheet title + * table > thead - Header rows (formatted bold) + * table > tbody - Data rows (no formatting) + * + * Copyright (c) 2015 Wine Logistix GmbH + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @category PHPExcel + * @package PHPExcel_Reader_HTML + * @copyright Copyright (c) 2015 Wine Logistix (http://www.wine-logistix.de) + * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL + * @version ##VERSION##, ##DATE## + */ +class PHPExcel_Reader_HTML_SemanticTable extends PHPExcel_Reader_HTML_Abstract +{ + + /** + * @var PHPExcel + */ + protected $excel; + + /** + * Write cell content at specified position to active sheet. + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function flushCell($column, $row, &$cellContent) + { + if (is_string($cellContent)) { + $cellContent = trim($cellContent); + if ($cellContent !== '') { + $this->excel->getActiveSheet()->setCellValue($column.$row, $cellContent); + } + } + } + + /** + * Handler for elements with no explicit handler. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function defaultElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + // This implementation doesn't care about any element except the ones + // for which an explicit handler is defined. To get to these elements + // though, children of the other elements need to be traversed. + return PHPExcel_Reader_HTML_Abstract::TRAVERSE_CHILDS; + } + + /** + * Handler for DOMText elements. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function textElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + } + + /** + * Handler which is executed after loading the HTML file and before + * traversing elements. + * @param PHPExcel $objPHPExcel + */ + protected function loadHandler(PHPExcel $objPHPExcel) + { + $this->excel = $objPHPExcel; + // Remove first sheet because if no table elements are occured + // in document, then it's an error in the source file. + $this->excel->removeSheetByIndex(0); + } + + protected function finishHandler() + { + if ($this->excel->getSheetCount() > 0) { + // This is cosmetic; during processing a worksheet was created + // for each table and the last created is set active. When opening + // the file in GUI, the last worksheet would open, but it's most + // likely desired to view the first worksheet first. + $this->excel->setActiveSheetIndex(0); + } + } + + /** + * Set document title. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function titleElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + $this->excel->getProperties()->setTitle($element->textContent); + } + + /** + * Create a new worksheet and use it as active sheet. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function tableElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + $sheetNum = $this->excel->getSheetCount(); + $this->excel->createSheet(); + $this->excel->setActiveSheetIndex($sheetNum); + // Row and column need to be reset. + $row = 1; + $column = 'A'; + return PHPExcel_Reader_HTML_Abstract::TRAVERSE_CHILDS; + } + + /** + * Set title of current active sheet. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function captionElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + $this->excel->getActiveSheet()->setTitle($element->textContent); + } + + /** + * For each header row in thead, create a row with bold formatted columns. + * @param DOMNode $element + * @param int $row + * @param string $column + * @param string $cellContent + */ + protected function theadElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + foreach ($element->childNodes as $child) { + if ($this->isElement($child, "tr")) { + $this->createHeaderRow($child, $row); + $row += 1; + } + } + // Don't traverse childs as they are already traversed in here. + } + + protected function tbodyElementHandler(DOMNode $element, &$row, &$column, &$cellContent) + { + foreach ($element->childNodes as $child) { + if ($this->isElement($child, "tr")) { + $this->createDataRow($child, $row); + $row += 1; + } + } + // Don't traverse childs as they are already traversed in here. + } + + protected function createHeaderRow(DOMNode $theadRow, $row) + { + $column = 'A'; + foreach ($theadRow->childNodes as $child) { + if ($this->isElement($child, "th")) { + $this->flushCell($column, $row, $child->textContent); + $column++; + } + } + // Formatting headers by using range is faster than doing it in the loop. + $range = sprintf('A%d:%s%d', $row, $column, $row); + $this->excel->getActiveSheet()->getStyle($range)->getFont()->setBold(true); + } + + protected function createDataRow(DOMNode $tbodyRow, $row) + { + $column = 'A'; + foreach ($tbodyRow->childNodes as $child) { + if ($this->isElement($child, "td")) { + $this->flushCell($column, $row, $child->textContent); + $column++; + } + } + } + + private function isElement($el, $name) { + return $el instanceof DOMNode && $el->nodeName === $name; + } + +}