SetaPDF Demos

There seems to be a problem loading the components. Please check your PHP error logs for details!

Common causes are that the trial license has not been installed or that you are using a trial version on an unsupported PHP version.

Interactive GUI

This demo shows you how to filter the result by a specific area on a PDF page. Just draw a rectangle on top of the image and the component will extract the text in this area from the PDF page.

MuPDF is used to generate an image of the PDF page, and Jcrop is laid over this image to select an area. Afterwards the coordinates of the selected area are normalized to the coordinate system of the original PDF page. The SetaPDF-Extractor simply uses these coordinates in a rectangle filter and returns the text extracted at this location.

PHP
<?php

// load and register the autoload function
require_once __DIR__ . '/../../../../../bootstrap.php';

// Whitelist of the documents offered by this demo: public name => path on disk.
// $assetsDirectory is not defined in this script; it is expected to be provided
// by bootstrap.php.
$files = [
    'Laboratory-Report.pdf' => $assetsDirectory . '/pdfs/tektown/Laboratory-Report.pdf',
    'Fact-Sheet.pdf' => $assetsDirectory . '/pdfs/tektown/Fact-Sheet.pdf',
    'Terms-and-Conditions.pdf' => $assetsDirectory . '/pdfs/camtown/Terms-and-Conditions.pdf',
];
// Resolution (dots per inch) of the rendered page images.
$dpi = 72;

/**
 * Resolves the "file" query parameter against the whitelist.
 *
 * Only names that are keys of $files are accepted, so a request can never
 * make the script touch an arbitrary path.
 *
 * @return string Path of the requested PDF file.
 * @throws Exception If the parameter is missing, not a string or unknown.
 */
$resolveRequestedFile = function () use ($files) {
    $name = isset($_GET['file']) ? $_GET['file'] : null;
    if (!is_string($name) || !array_key_exists($name, $files)) {
        throw new Exception('Invalid file!');
    }

    return $files[$name];
};

/**
 * Reads the "page" query parameter as a positive integer.
 *
 * The value ends up in a file name and on a command line, so everything that
 * is not a plain integer >= 1 is rejected.
 *
 * @param int|null $default Value used if the parameter is absent; null makes the parameter mandatory.
 * @return int
 * @throws Exception If the parameter is missing (without default) or not a positive integer.
 */
$readPageNumber = function ($default = null) {
    if (!isset($_GET['page'])) {
        if ($default === null) {
            throw new Exception('Invalid page number!');
        }

        return $default;
    }

    $pageNumber = filter_var($_GET['page'], FILTER_VALIDATE_INT, ['options' => ['min_range' => 1]]);
    if ($pageNumber === false) {
        throw new Exception('Invalid page number!');
    }

    return $pageNumber;
};

/**
 * Sends a non-cacheable response with the given body.
 *
 * @param string $contentType Value of the Content-Type header.
 * @param string $content Response body.
 * @param string|null $filename If given, an inline Content-Disposition header with this file name is sent.
 */
$sendResponse = function ($contentType, $content, $filename = null) {
    header('Content-Type: ' . $contentType);
    if ($filename !== null) {
        // the file name has to be passed as a "filename" parameter to be a valid header value
        header('Content-Disposition: inline; filename="' . $filename . '"');
    }
    header('Expires: 0');
    header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
    header('Pragma: public');
    header('Accept-Ranges: none');
    header('Content-Length: ' . strlen($content));
    echo $content;
};

$action = isset($_GET['action']) ? $_GET['action'] : null;

if ($action === 'preview') {
    // download the pdf file
    $file = $resolveRequestedFile();

    $content = file_get_contents($file);
    if ($content === false) {
        throw new Exception('File could not be read!');
    }

    $sendResponse('application/pdf', $content, 'preview.pdf');
    return;

} elseif ($action === 'generateImagePreview') {
    // generate the preview image of the pdf
    $file = $resolveRequestedFile();
    $pageNo = $readPageNumber(1);

    // "PAGE" is a placeholder: it becomes the page number for the cached file
    // and "%d" in the output pattern that is passed to mutool.
    $imageFile = 'images/' . basename($file, '.pdf') . '-' . $dpi . '-PAGE.png';
    $realImageFile = str_replace('PAGE', (string) $pageNo, $imageFile);

    $isRendered = file_exists($realImageFile);
    if (!$isRendered) {
        // The placeholder is replaced after escaping so that the "%d" reaches mutool unchanged.
        $cmd = 'mutool draw -F png -r ' . escapeshellarg((string) $dpi)
            . ' -o ' . str_replace('PAGE', '%d', escapeshellarg($imageFile))
            . ' ' . escapeshellarg($file) . ' ' . escapeshellarg((string) $pageNo);

        exec($cmd, $output, $resultCode);
        $isRendered = ($resultCode === 0);
    }

    // A successful exit code does not guarantee that the image can be read.
    $content = $isRendered ? file_get_contents($realImageFile) : false;
    if ($content === false) {
        echo 'Thumbnail could not be generated. Please make sure that ' .
            '<a href="https://www.mupdf.com/docs/manual-mutool-draw.html" target="_blank">mutool</a> is installed ' .
            'and that the images/ folder is writable.';
        die();
    }

    $sendResponse('image/png', $content, 'image.png');
    return;

} elseif ($action === 'fetchPageCountAndFormats') {
    // fetch the page count and the page size
    $file = $resolveRequestedFile();

    $document = \SetaPDF_Core_Document::loadByFilename($file);
    $pages = $document->getCatalog()->getPages();
    $pageCount = $pages->count();
    if ($pageCount === 0) {
        throw new Exception('PDF is empty');
    }

    // one [width, height] pair per page, in page order
    $pageFormats = [];
    for ($i = 1; $i <= $pageCount; $i++) {
        list($width, $height) = $pages->getPage($i)->getWidthAndHeight();
        $pageFormats[] = [$width, $height];
    }

    $sendResponse('application/json', json_encode([
        'pageCount' => $pageCount,
        'pageFormats' => $pageFormats,
    ]));
    return;

} elseif ($action === 'extract') {
    // extract text by selected locations
    $file = $resolveRequestedFile();
    $page = $readPageNumber();

    // x1/y1: upper left point, x2/y2: lower right point
    $data = (isset($_GET['data']) && is_array($_GET['data'])) ? $_GET['data'] : [];
    $coordinates = [];
    foreach (['x1', 'y1', 'x2', 'y2'] as $key) {
        if (!isset($data[$key]) || !is_numeric($data[$key])) {
            throw new Exception('Invalid coordinates!');
        }
        $coordinates[$key] = (float) $data[$key];
    }

    // load the document
    $document = \SetaPDF_Core_Document::loadByFilename($file);

    // the interesting part: initiate an extractor instance
    $extractor = new \SetaPDF_Extractor($document);

    // create a strategy instance
    $strategy = new \SetaPDF_Extractor_Strategy_ExactPlain();
    // pass a rectangle filter to the strategy
    $strategy->setFilter(new \SetaPDF_Extractor_Filter_Rectangle(
        new \SetaPDF_Core_Geometry_Rectangle(
            $coordinates['x1'],
            $coordinates['y1'],
            $coordinates['x2'],
            $coordinates['y2']
        ),
        \SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT
    ));
    $extractor->setStrategy($strategy);

    // get the text of a page
    $result = $extractor->getResultByPageNumber($page);

    // The text is HTML-escaped here because the GUI inserts it as markup.
    $sendResponse('application/json', json_encode([
        'result' => htmlspecialchars($result),
    ]));
    return;
} else {
    // no (known) action: let the user choose a file and show the GUI
    $filePath = displayFiles($files);
    // map the chosen path back to its public name; strict to avoid loose type juggling
    $file = array_search($filePath, $files, true);
    if ($file === false) {
        throw new Exception('Invalid file selected');
    }
    require './gui.php';
}
PHP
<?php
// This template relies on $dpi, $file and $basePath being defined by the script
// that includes it. Stop without any output if one of them is missing.
if (!isset($dpi) || !isset($file) || !isset($basePath)) {
    exit;
}

// Path of the currently executed script; the JavaScript below uses it as the
// URL for its requests.
$script = $_SERVER['SCRIPT_NAME'];
?>
<html lang="en">
<head>
    <!-- jQuery 2.2.4; the integrity attribute pins the exact file served by the CDN -->
    <script src="https://cdn.jsdelivr.net/jquery/2.2.4/jquery.min.js"
            integrity="sha256-BbhdlvQf/xTY9gja0Dq3HiwQF8LaCRTXxZKRutelT44="
            crossorigin="anonymous"
    ></script>
    <!-- jQuery BlockUI 2.70.0: provides $.blockUI() / $.unblockUI() used while requests are running -->
    <script type="text/javascript"
            src="https://cdn.jsdelivr.net/jquery.blockui/2.70.0/jquery.blockUI.min.js"
            integrity="sha256-9wSYpoBdTOlj3azv4n74Mlb984+xKfTS7dhcYRqSqMA="
            crossorigin="anonymous"
    ></script>

    <!-- Jcrop 0.9.12 (stylesheet and script): used to select a rectangle on the page image -->
    <link rel="stylesheet"
          type="text/css"
          href="https://cdn.jsdelivr.net/jquery.jcrop/0.9.12/css/jquery.Jcrop.min.css"
          integrity="sha256-/fCoT6hQHsrj1J/wn7oNqgWmtm9alQ2QRwWm2B0Fo1o="
          crossorigin="anonymous"/>
    <script src="https://cdn.jsdelivr.net/jquery.jcrop/0.9.12/js/jquery.Jcrop.min.js"
            integrity="sha256-ZxCBLDyBkvv5I47GMz1THCbcQ00JR0BvWlqWUEXupKI="
            crossorigin="anonymous"
    ></script>

</head>
<body>
<!-- Two-column layout: page selection and page image on the left, extracted text on the right.
     All three containers are empty here and are filled by the script at the end of this file. -->
<table>
    <tr>
        <td>
            <!-- receives the "Page number" select box -->
            <fieldset class="pageCount" style="border: 0;"></fieldset>
            <!-- receives the rendered page image on which the selection is drawn -->
            <div class="imageContainer" style="border: 1px solid #d3d3d3;"></div>
        </td>
        <td style="vertical-align: top; padding: 5px;">
            <!-- receives the text extracted for the current selection -->
            <div class="extractedText"></div>
        </td>
    </tr>
</table>

<?php
// Values coming from PHP are embedded as JSON literals. The JSON_HEX_* flags make
// sure that no value can terminate the string literal or the <script> element.
$jsonFlags = JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT;
// Markup of the loading indicator; $basePath is escaped for the attribute context.
$loaderMarkup = '<img src="' . htmlspecialchars($basePath, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
    . 'layout/img/ajax-loader-big.gif" />';
?>
<script type="text/javascript">
    $(function() {
        var actualPage = 1,        // currently displayed page number (1-based)
            isLoading = false,     // true while the page count/formats are being fetched
            dpi = <?=json_encode($dpi, $jsonFlags)?>,
            file = <?=json_encode($file, $jsonFlags)?>,
            scriptUrl = <?=json_encode($script, $jsonFlags)?>,
            pageFormats,           // [[width, height], ...] as returned by the server, one entry per page
            jcrop;                 // Jcrop API instance of the current image

        $.blockUI.defaults.message = <?=json_encode($loaderMarkup, $jsonFlags)?>;
        $.extend($.blockUI.defaults.css, {
            backgroundColor: 'transparent',
            border: 'none',
            color: '#fff'
        });

        /**
         * Requests the text inside the given selection and displays it.
         *
         * @param {Object} c Selection as passed by Jcrop (x/y: upper left, x2/y2: lower right, in image pixels).
         */
        var extractSelection = function (c) {
            if (isLoading) {
                return;
            }

            // Scale the image pixels back by the rendering resolution (72 dpi
            // gives a factor of 1) and flip the y-axis using the page height.
            var pageHeight = pageFormats[actualPage - 1][1],
                dpiFactor = dpi / 72;

            $.blockUI();
            $.ajax({
                url: scriptUrl,
                type: 'GET',
                cache: false,
                dataType: 'json',
                // passed as an object so that jQuery takes care of the URL encoding
                data: {
                    action: 'extract',
                    file: file,
                    page: actualPage,
                    data: {
                        x1: c.x / dpiFactor,
                        y1: pageHeight - c.y / dpiFactor,
                        x2: c.x2 / dpiFactor,
                        y2: pageHeight - c.y2 / dpiFactor
                    }
                }
            }).done(function(result) {
                try {
                    // the server returns the text already HTML-escaped, so it is inserted as markup
                    $('div.extractedText')
                        .empty()
                        .html('<h3>Script Output:</h3><pre>' + result.result + '</pre>');
                } catch(error) {
                    console.error(error);
                }

                $.unblockUI();
            }).fail(function(error) {
                console.error(error.responseText);
                $.unblockUI();
            });
        };

        /**
         * (Re-)creates the page image for the current page and attaches Jcrop to it.
         */
        var initImage = function () {
            if (jcrop) {
                jcrop.destroy();
                // forget the destroyed instance in case the next image fails to load
                jcrop = null;
            }

            $('div.extractedText').empty();

            // note: this controller generates an image of the pdf + page number
            var imageUrl = scriptUrl + '?' + $.param({
                action: 'generateImagePreview',
                file: file,
                page: actualPage
            });

            var image = $('<img class="demoImage"/>');
            // Jcrop is attached once the image has been loaded
            image.on('load', function () {
                image.Jcrop({
                    onSelect: extractSelection,
                    onRelease: function() {
                        $('a[href="#code"]').addClass('disabled');
                        $('div.extractedText').empty();
                    }
                }, function () {
                    jcrop = this;
                });
            });

            $('.imageContainer').empty().append(image);
            // set the source last so that the load handler is in place before loading starts
            image.attr('src', imageUrl);
        };

        // initial request: fetch the page count and the page formats of the document
        isLoading = true;
        $('div.extractedText').empty();
        $.blockUI();
        $.ajax({
            url: scriptUrl,
            type: 'GET',
            cache: false,
            dataType: 'json',
            data: {
                action: 'fetchPageCountAndFormats',
                file: file
            }
        }).done(function(result) {
            isLoading = false;
            $.unblockUI();

            var fieldset = $('fieldset.pageCount');
            fieldset.empty();

            pageFormats = result.pageFormats;
            var pageNumberSelect = '<label for="pageNumber" style="margin-right: 5px;">Page number:</label><select name="data[page]" id="pageNumber">';
            for (var i = 1; i <= result.pageCount; i++) {
                pageNumberSelect += '<option value="' + i + '"'+ (i === actualPage ? ' selected="selected"' : '') +'>'
                    + i + '</option>';
            }
            pageNumberSelect += '</select>';

            fieldset.html(pageNumberSelect);

            $('select#pageNumber', fieldset).change(function() {
                // the value of a select is a string; keep the page number numeric
                actualPage = parseInt($(this).val(), 10);
                initImage();
            });

            initImage();
        }).fail(function(error) {
            isLoading = false;
            console.error(error.responseText);
            $.unblockUI();
        });
    });
</script>
</body>
</html>