Check for Text
This demo shows a simple content stream parser which will catch text output operators.
Note that this demo only detects whether text is present; it does not give you access to the text itself. To extract text, check out the demos of the SetaPDF-Extractor component.
PHP
<?php
use setasign\SetaPDF2\Demos\ContentStreamProcessor\TextProcessor;
use setasign\SetaPDF2\Core\Document;
// load and register the autoload function
// NOTE(review): $assetsDirectory, $classesDirectory and displayFiles() are not defined in this
// script; presumably the bootstrap file provides them - confirm.
require_once '../../../../../bootstrap.php';

// prepare some files
$files = [
    $assetsDirectory . '/pdfs/Brand-Guide.pdf',
    $assetsDirectory . '/pdfs/Fact-Sheet-form.pdf',
    $assetsDirectory . '/pdfs/lenstown/Laboratory-Report.pdf',
];

// glob() returns false on error; array_merge() only accepts arrays, so fall back to an empty list
$files = array_merge($files, glob($assetsDirectory . '/pdfs/misc/*.pdf') ?: []);

$path = displayFiles($files);

// require the text processor class
require_once $classesDirectory . '/ContentStreamProcessor/TextProcessor.php';

// load a document instance
$document = Document::loadByFilename($path);

// get access to the pages object
$pages = $document->getCatalog()->getPages();

// walk through the pages (page numbers are 1-based)
for ($pageNo = 1, $pageCount = $pages->count(); $pageNo <= $pageCount; $pageNo++) {
    $canvas = $pages->getPage($pageNo)->getCanvas();

    // create a text processor instance
    $processor = new TextProcessor($canvas);

    // check for text
    if ($processor->hasText()) {
        echo 'Page ' . $pageNo . ' has text!';
    } else {
        echo 'Page ' . $pageNo . ' has NO text!';
    }

    // emit a valid line break element ("</br>" is a stray end tag and therefore invalid markup)
    echo '<br />';
}
PHP
<?php
namespace setasign\SetaPDF2\Demos\ContentStreamProcessor;
use setasign\SetaPDF2\Core\Canvas\Canvas;
use setasign\SetaPDF2\Core\Filter\Exception as FilterException;
use setasign\SetaPDF2\Core\Parser\Content;
use setasign\SetaPDF2\Core\Resource\ResourceInterface;
use setasign\SetaPDF2\Core\XObject\XObject;
use setasign\SetaPDF2\Core\XObject\Form;
/**
 * Checks whether a canvas emits any text.
 *
 * The content stream of the canvas is scanned for the text output operators
 * Tj, TJ, " and '. Form XObjects painted through the Do operator are checked
 * recursively with a nested processor instance. The text itself is not extracted.
 */
class TextProcessor
{
    /**
     * The canvas object
     *
     * @var Canvas
     */
    protected $_canvas;

    /**
     * Object ids of the form XObjects that are currently being processed.
     *
     * Nested processors share this array by reference, so a form XObject that
     * (directly or indirectly) paints itself is detected and skipped.
     *
     * @var array
     */
    protected $_xObjectObjectIds = [];

    /**
     * Result of the last check.
     *
     * @var bool
     */
    protected $_hasText = false;

    /**
     * The constructor
     *
     * The parameter is the canvas instance.
     *
     * @param Canvas $canvas
     */
    public function __construct(Canvas $canvas)
    {
        $this->_canvas = $canvas;
    }

    /**
     * Checks for text on the initially passed canvas instance.
     *
     * Returns true if there is any text in the stream, otherwise false
     *
     * @return bool
     */
    public function hasText(): bool
    {
        // always start from a clean state so a previous run cannot leak into this one
        $this->_hasText = false;

        // if there are no resources no text can be output because no font is defined
        if ($this->_canvas->getResources() === false) {
            return false;
        }

        $parser = $this->_createContentParser();
        try {
            $parser->process();
        } finally {
            // clean up the parser even if processing the stream failed
            $parser->cleanUp();
        }

        return $this->_hasText;
    }

    /**
     * Create a content parser instance.
     *
     * The parser is created for the stream of the canvas and gets two callbacks
     * registered: one for the text output operators and one for the Do operator.
     *
     * @return Content
     */
    protected function _createContentParser()
    {
        try {
            $stream = $this->_canvas->getStream();
        } catch (FilterException $e) {
            // if a stream cannot be unfiltered, we ignore it
            $stream = '';
        }

        $contentParser = new Content($stream);

        // register a callback for text output operators
        $contentParser->registerOperator(
            ['Tj', 'TJ', '"', "'"],
            function () {
                $this->_hasText = true;

                // NOTE(review): returning false presumably stops the parser, one hit is enough - confirm
                return false;
            }
        );

        // register a callback to handle form XObjects
        $contentParser->registerOperator(
            'Do',
            function ($arguments) {
                // a Do operator without an operand is malformed, there is nothing to resolve
                if (!isset($arguments[0])) {
                    return;
                }

                $xObjects = $this->_canvas->getResources(true, false, ResourceInterface::TYPE_X_OBJECT);
                if ($xObjects === false) {
                    return;
                }

                $reference = $xObjects->getValue($arguments[0]->getValue());
                if ($reference === null) {
                    // the name is not registered in the XObject resources, so nothing can be painted
                    return;
                }

                $xObject = XObject::get($reference);
                if (!($xObject instanceof Form)) {
                    // only form XObjects are checked recursively
                    return;
                }

                $objectId = $xObject->getIndirectObject()->getObjectId();
                if (isset($this->_xObjectObjectIds[$objectId])) {
                    // recursion
                    return;
                }

                $this->_xObjectObjectIds[$objectId] = true;
                try {
                    $processor = new self($xObject->getCanvas());
                    // share the ids of the XObjects in progress with the nested processor
                    $processor->_xObjectObjectIds =& $this->_xObjectObjectIds;
                    $this->_hasText = $processor->hasText();
                } finally {
                    // the id only guards against recursion, so release it even on failure
                    unset($this->_xObjectObjectIds[$objectId]);
                }

                if ($this->_hasText === true) {
                    return false;
                }
            }
        );

        return $contentParser;
    }
}