SetaPDF Demos

Check for Text

This demo shows a simple content stream parser that detects text output operators.

Note that this demo only detects whether text is present; it does not give you access to the text itself. To extract text, check out the demos of the SetaPDF-Extractor component.

 script.php
 TextProcessor.php
 Run

PHP

copy

<?php

use com\setasign\SetaPDF\Demos\ContentStreamProcessor\TextProcessor;

// load and register the autoload function
require_once '../../../../../bootstrap.php';

// prepare some files
$files = [
    $assetsDirectory . '/pdfs/Brand-Guide.pdf',
    $assetsDirectory . '/pdfs/Fact-Sheet-form.pdf',
    $assetsDirectory . '/pdfs/lenstown/Laboratory-Report.pdf',
];
// glob() returns false on error (and on some systems when nothing matches);
// fall back to an empty list so array_merge() always receives an array
$files = array_merge($files, glob($assetsDirectory . '/pdfs/misc/*.pdf') ?: []);

// displayFiles() is a demo helper defined outside of this script
// NOTE(review): presumably it returns the path of the file chosen by the user - confirm
$path = displayFiles($files);

// require the text processor class
require_once $classesDirectory . '/ContentStreamProcessor/TextProcessor.php';

// load a document instance
$document = \SetaPDF_Core_Document::loadByFilename($path);
// get access to the pages object
$pages = $document->getCatalog()->getPages();

// walk through the pages (the loop counts page numbers starting at 1)
for ($pageNo = 1, $pageCount = $pages->count(); $pageNo <= $pageCount; $pageNo++) {
    $canvas = $pages->getPage($pageNo)->getCanvas();

    // create a text processor instance for the canvas of the current page
    $processor = new TextProcessor($canvas);

    // check for text
    if ($processor->hasText()) {
        echo 'Page ' . $pageNo . ' has text!';
    } else {
        echo 'Page ' . $pageNo . ' has NO text!';
    }

    // "</br>" is a stray end tag and not valid HTML; emit a proper line break
    echo '<br>';
}

PHP

copy

<?php

namespace com\setasign\SetaPDF\Demos\ContentStreamProcessor;

/**
 * Class TextProcessor
 *
 * Parses the content stream of a canvas and reports whether it contains any
 * text output operator (Tj, TJ, " or '). Form XObjects painted through the
 * Do operator are checked as well by processing their canvases with a nested
 * processor instance.
 */
class TextProcessor
{
    /**
     * The canvas object
     *
     * @var \SetaPDF_Core_Canvas
     */
    protected $_canvas;

    /**
     * Result flag of the current/last hasText() run.
     *
     * @var bool
     */
    protected $_hasText = false;

    /**
     * The constructor
     *
     * The parameter is the canvas instance.
     *
     * @param \SetaPDF_Core_Canvas $canvas
     */
    public function __construct(\SetaPDF_Core_Canvas $canvas)
    {
        $this->_canvas = $canvas;
    }

    /**
     * Checks for text on the initially passed canvas instance.
     *
     * Returns true if there is any text in the stream, otherwise false
     *
     * @return bool
     */
    public function hasText()
    {
        // reset the flag first, so that it never keeps the result of a previous run
        $this->_hasText = false;

        // if there are no resources no text can be output because no font is defined
        $resources = $this->_canvas->getResources();
        if ($resources === false) {
            return false;
        }

        $parser = $this->_createContentParser();
        try {
            $parser->process();
        } finally {
            // always clean up the parser, even if processing was aborted by an exception
            $parser->cleanUp();
        }

        return $this->_hasText;
    }

    /**
     * Create a content parser instance.
     *
     * The parser is created for the stream of the canvas and gets two callbacks
     * registered: one for the text output operators and one for the Do operator.
     *
     * @return \SetaPDF_Core_Parser_Content
     */
    protected function _createContentParser()
    {
        try {
            $stream = $this->_canvas->getStream();
        } catch (\SetaPDF_Core_Filter_Exception $e) {
            // if a stream cannot be unfiltered, we ignore it
            $stream = '';
        }

        $contentParser = new \SetaPDF_Core_Parser_Content($stream);

        // register a callback for text output operators
        $contentParser->registerOperator(
            ['Tj', 'TJ', '"', "'"],
            function () {
                $this->_hasText = true;
                // NOTE(review): returning false is expected to stop the parser - confirm
                // against the SetaPDF_Core_Parser_Content documentation
                return false;
            }
        );

        // register a callback to handle form XObjects
        $contentParser->registerOperator(
            'Do',
            function ($arguments) {
                // a malformed stream may use the operator without its name operand
                if (!isset($arguments[0])) {
                    return;
                }

                $xObjects = $this->_canvas->getResources(true, false, \SetaPDF_Core_Resource::TYPE_X_OBJECT);
                if ($xObjects === false) {
                    return;
                }

                $xObject = $xObjects->getValue($arguments[0]->getValue());
                // the stream may reference a name that is not defined in the resources
                if ($xObject === null) {
                    return;
                }

                $xObject = \SetaPDF_Core_XObject::get($xObject);

                // only form XObjects have a canvas that needs to be checked
                if ($xObject instanceof \SetaPDF_Core_XObject_Form) {
                    $processor = new self($xObject->getCanvas());

                    $this->_hasText = $processor->hasText();
                    if ($this->_hasText === true) {
                        return false;
                    }
                }
            }
        );

        return $contentParser;
    }
}

 script.php
 TextProcessor.php
 Run

Check for Transparency

Check for PDF/A information