SetaPDF Demos

Interactive GUI

This demo shows you how to filter the result by a specific area on a PDF page. Just draw a rectangle on the image and the component will extract the text in this area from the PDF page.

MuPDF is used to generate an image of the PDF page, and Jcrop is laid over this image to select an area. Afterwards, the coordinates of the selected area are converted to the coordinate system of the original PDF page. The SetaPDF-Extractor simply uses these coordinates in a Rectangle Filter and returns the text extracted at this location.

PHP
<?php

// load and register the autoload function
require_once __DIR__ . '/../../../../../bootstrap.php';

// Whitelist of the selectable demo documents: the name sent by the client maps to the path on disk.
// Only the keys of this array are ever accepted as the "file" request parameter.
// NOTE(review): $assetsDirectory is not defined in this script - presumably provided by bootstrap.php.
$files = [
    'Laboratory-Report.pdf' => $assetsDirectory . '/pdfs/tektown/Laboratory-Report.pdf',
    'Fact-Sheet.pdf' => $assetsDirectory . '/pdfs/tektown/Fact-Sheet.pdf',
    'Terms-and-Conditions.pdf' => $assetsDirectory . '/pdfs/camtown/Terms-and-Conditions.pdf',
];

// Resolution (dots per inch) used to render the preview images. The GUI reads the same value to
// convert pixel coordinates of the image back into PDF units.
$dpi = 72;

$action = isset($_GET['action']) ? $_GET['action'] : null;

/**
 * Resolves the "file" request parameter to the path of a whitelisted document.
 *
 * @return string The path on disk of the requested document.
 * @throws Exception If the parameter is missing, is not a string or is not a key of $files.
 */
$resolveRequestedFile = function () use ($files) {
    $name = isset($_GET['file']) ? $_GET['file'] : null;
    // the is_string() check keeps array values (e.g. "file[]=x") away from array_key_exists()
    if (!is_string($name) || !array_key_exists($name, $files)) {
        throw new Exception('Invalid file!');
    }

    return $files[$name];
};

/**
 * Reads the "page" request parameter as a positive integer.
 *
 * @param int|null $default Value used if the parameter is missing; null makes the parameter mandatory.
 * @return int The validated page number (>= 1).
 * @throws Exception If the value is missing (without default) or is not a positive integer.
 */
$readPageNumber = function ($default = null) {
    $raw = isset($_GET['page']) ? $_GET['page'] : $default;
    $page = filter_var($raw, FILTER_VALIDATE_INT, ['options' => ['min_range' => 1]]);
    if ($page === false) {
        throw new Exception('Invalid page number!');
    }

    return $page;
};

/**
 * Sends a non-cacheable response body together with its headers.
 *
 * @param string $contentType Value of the Content-Type header.
 * @param string|false $content The body; false (a failed read/encode) is rejected.
 * @param string|null $disposition Optional value of the Content-Disposition header.
 * @throws Exception If $content is not a string.
 */
$sendResponse = function ($contentType, $content, $disposition = null) {
    // file_get_contents() and json_encode() signal failure with false - do not send an empty 200 response then
    if (!is_string($content)) {
        throw new Exception('The response could not be created.');
    }

    header('Content-Type: ' . $contentType);
    if ($disposition !== null) {
        header('Content-Disposition: ' . $disposition);
    }
    header('Expires: 0');
    header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
    header('Pragma: public');
    header('Accept-Ranges: none');
    header('Content-Length: ' . strlen($content));
    echo $content;
};

if ($action === 'preview') {
    // deliver the pdf file itself
    $file = $resolveRequestedFile();

    $sendResponse('application/pdf', file_get_contents($file), 'inline; filename="preview.pdf"');
    return;

} elseif ($action === 'generateImagePreview') {
    // generate (or reuse) the preview image of a single page of the pdf
    $file = $resolveRequestedFile();
    // The page number becomes part of a file name and of a shell command, so it has to be a plain
    // integer: this rules out path traversal ("../") as well as page ranges ("1-100").
    $pageNo = $readPageNumber(1);

    // "PAGE" is a placeholder: it is replaced by the page number for the local file name and by
    // "%d" for the output pattern passed to mutool.
    $imageFile = 'images/' . basename($file, '.pdf') . '-' . $dpi . '-PAGE.png';
    $realImageFile = str_replace('PAGE', (string) $pageNo, $imageFile);

    if (!file_exists($realImageFile)) {
        // The "%d" is inserted AFTER escaping on purpose, so that escapeshellarg() never sees the
        // percent sign.
        $outputPattern = str_replace('PAGE', '%d', escapeshellarg($imageFile));
        $cmd = 'mutool draw -F png -r ' . escapeshellarg((string) $dpi) . ' -o ' . $outputPattern
            . ' ' . escapeshellarg($file) . ' ' . escapeshellarg((string) $pageNo);

        exec($cmd, $output, $resultCode);

        // also treat a "successful" run that did not produce the expected image as a failure
        if ($resultCode !== 0 || !file_exists($realImageFile)) {
            http_response_code(500);
            echo 'Thumbnail could not be generated. Please make sure that ' .
                '<a href="https://www.mupdf.com/docs/manual-mutool-draw.html" target="_blank">mutool</a> is installed ' .
                'and that the images/ folder is writable.';
            die();
        }
    }

    $sendResponse('image/png', file_get_contents($realImageFile), 'inline; filename="image.png"');
    return;

} elseif ($action === 'fetchPageCountAndFormats') {
    // fetch the page count and the size of each page
    $file = $resolveRequestedFile();

    $document = SetaPDF_Core_Document::loadByFilename($file);
    $pages = $document->getCatalog()->getPages();
    $pageCount = $pages->count();
    if ($pageCount === 0) {
        throw new Exception('PDF is empty');
    }

    // one [width, height] pair per page, in page order (index 0 = page 1)
    $pageFormats = [];
    for ($pageNo = 1; $pageNo <= $pageCount; $pageNo++) {
        list($width, $height) = $pages->getPage($pageNo)->getWidthAndHeight();
        $pageFormats[] = [$width, $height];
    }

    $sendResponse('application/json', json_encode([
        'pageCount' => $pageCount,
        'pageFormats' => $pageFormats,
    ]));
    return;

} elseif ($action === 'extract') {
    // extract the text located in the selected area of a page
    $file = $resolveRequestedFile();
    $page = $readPageNumber();

    // x1/y1 is the upper left and x2/y2 the lower right point of the selection
    $data = (isset($_GET['data']) && is_array($_GET['data'])) ? $_GET['data'] : [];
    $rect = [];
    foreach (['x1', 'y1', 'x2', 'y2'] as $key) {
        if (!isset($data[$key]) || !is_numeric($data[$key])) {
            throw new Exception('Invalid coordinates!');
        }
        $rect[$key] = (float) $data[$key];
    }

    // load the document
    $document = SetaPDF_Core_Document::loadByFilename($file);

    // the interesting part: initiate an extractor instance
    $extractor = new SetaPDF_Extractor($document);

    // create an exact plain strategy instance
    $strategy = new SetaPDF_Extractor_Strategy_ExactPlain();
    // pass a rectangle filter to the strategy
    $strategy->setFilter(new SetaPDF_Extractor_Filter_Rectangle(
        new SetaPDF_Core_Geometry_Rectangle($rect['x1'], $rect['y1'], $rect['x2'], $rect['y2']),
        SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT
    ));
    $extractor->setStrategy($strategy);

    // get the text of the page
    $result = $extractor->getResultByPageNumber($page);

    // The client inserts the result as HTML, so it is escaped here. The flags are passed
    // explicitly because the defaults of htmlspecialchars() differ between PHP versions.
    $sendResponse('application/json', json_encode([
        'result' => htmlspecialchars($result, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'),
    ]));
    return;

} else {
    // no (known) action: render the GUI for the selected document
    // NOTE(review): displayFiles() is not defined in this script - presumably it comes from
    // bootstrap.php and returns the path of the document chosen by the user.
    $filePath = displayFiles($files);
    // gui.php expects $file to be the whitelisted NAME (array key), not the path
    $file = array_search($filePath, $files, true);
    if ($file === false) {
        throw new Exception('Invalid file selected');
    }
    require './gui.php';
}
PHP
<?php
// This template is meant to be included by the controller script, which has to provide
// $dpi, $file (the name of the selected document) and $basePath. Without them nothing is rendered.
if (!isset($dpi, $file, $basePath)) {
    die();
}
$script = $_SERVER['SCRIPT_NAME'];

// Flags that make the output of json_encode() safe to embed into an inline <script> element.
$jsFlags = JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT;
// Markup of the loading indicator shown by blockUI; the URL is escaped for the HTML attribute here
// and the whole string is passed to JavaScript through json_encode() below.
$loaderMarkup = '<img src="'
    . htmlspecialchars($basePath . 'layout/img/ajax-loader-big.gif', ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
    . '" />';
?>
<html lang="en">
<head>
    <script src="https://cdn.jsdelivr.net/jquery/2.2.4/jquery.min.js"
            integrity="sha256-BbhdlvQf/xTY9gja0Dq3HiwQF8LaCRTXxZKRutelT44="
            crossorigin="anonymous"
    ></script>
    <script type="text/javascript"
            src="https://cdn.jsdelivr.net/jquery.blockui/2.70.0/jquery.blockUI.min.js"
            integrity="sha256-9wSYpoBdTOlj3azv4n74Mlb984+xKfTS7dhcYRqSqMA="
            crossorigin="anonymous"
    ></script>

    <link rel="stylesheet"
          type="text/css"
          href="https://cdn.jsdelivr.net/jquery.jcrop/0.9.12/css/jquery.Jcrop.min.css"
          integrity="sha256-/fCoT6hQHsrj1J/wn7oNqgWmtm9alQ2QRwWm2B0Fo1o="
          crossorigin="anonymous"/>
    <script src="https://cdn.jsdelivr.net/jquery.jcrop/0.9.12/js/jquery.Jcrop.min.js"
            integrity="sha256-ZxCBLDyBkvv5I47GMz1THCbcQ00JR0BvWlqWUEXupKI="
            crossorigin="anonymous"
    ></script>

</head>
<body>
<table>
    <tr>
        <td>
            <fieldset class="pageCount" style="border: 0;"></fieldset>
            <div class="imageContainer" style="border: 1px solid #d3d3d3;"></div>
        </td>
        <td style="vertical-align: top; padding: 5px;">
            <div class="extractedText"></div>
        </td>
    </tr>
</table>

<script type="text/javascript">
    $(function() {
        // Values handed over from PHP. They are emitted through json_encode(), so they arrive as
        // proper JavaScript literals no matter which characters they contain.
        var dpi = <?=json_encode($dpi, $jsFlags)?>,
            file = <?=json_encode($file, $jsFlags)?>,
            script = <?=json_encode($script, $jsFlags)?>,
            actualPage = 1,      // currently displayed page (1-based, always a number)
            isLoading = false,   // true while the page count/formats are being fetched
            pageFormats,         // [width, height] per page, as returned by the controller
            jcrop;               // Jcrop API instance of the currently displayed image

        $.blockUI.defaults.message = <?=json_encode($loaderMarkup, $jsFlags)?>;
        $.extend($.blockUI.defaults.css, {
            backgroundColor: 'transparent',
            border: 'none',
            color: '#fff'
        });

        /**
         * Sends the selected area (given in image pixels) to the controller and shows the text
         * extracted from this area.
         */
        var extractSelection = function (c) {
            if (isLoading) {
                return;
            }

            // The image is rendered with `dpi` pixels per inch while PDF units are 1/72 inch.
            // The image's y-axis points down, so y values are flipped using the page height.
            var pageHeight = pageFormats[actualPage - 1][1],
                scale = dpi / 72;

            $.blockUI();
            $.ajax({
                url: script,
                type: 'GET',
                cache: false,
                dataType: 'json',
                // passed as an object, so jQuery takes care of the URL encoding;
                // the nested object is serialized as data[x1]=...&data[y1]=...
                data: {
                    action: 'extract',
                    file: file,
                    page: actualPage,
                    data: {
                        x1: c.x / scale,
                        y1: pageHeight - c.y / scale,
                        x2: c.x2 / scale,
                        y2: pageHeight - c.y2 / scale
                    }
                }
            }).done(function (result) {
                try {
                    // result.result is inserted as HTML: the controller escapes it with
                    // htmlspecialchars() before sending it
                    $('div.extractedText')
                        .empty()
                        .html('<h3>Script Output:</h3><pre>' + result.result + '</pre>');
                } catch (error) {
                    console.error(error);
                }

                $.unblockUI();
            }).fail(function (error) {
                console.error(error.responseText);
                $.unblockUI();
            });
        };

        /**
         * (Re-)creates the preview image of the current page and attaches Jcrop to it.
         */
        var initImage = function () {
            if (jcrop) {
                jcrop.destroy();
                // forget the destroyed instance, so it is not destroyed a second time if the page
                // is changed again before the next image has finished loading
                jcrop = null;
            }

            $('div.extractedText').empty();

            // note: the controller generates an image of the pdf + page number
            var image = $('<img class="demoImage"/>')
                .on('load', function () {
                    $(this).Jcrop({
                        onSelect: extractSelection,
                        onRelease: function () {
                            // NOTE(review): no such link exists in this template - presumably it
                            // belongs to a page embedding this demo.
                            $('a[href="#code"]').addClass('disabled');

                            $('div.extractedText').empty();
                        }
                    }, function () {
                        jcrop = this;
                    });
                })
                .on('error', function () {
                    console.error('The preview image of page ' + actualPage + ' could not be loaded.');
                });

            $('.imageContainer').empty().append(image);
            // the handlers are attached before the source is set, so no event can be missed
            image.attr('src', script + '?' + $.param({
                action: 'generateImagePreview',
                file: file,
                page: actualPage
            }));
        };

        // initial load: fetch the page count and page sizes, then build the page selector
        isLoading = true;
        $('div.extractedText').empty();
        $.blockUI();
        $.ajax({
            url: script,
            type: 'GET',
            cache: false,
            dataType: 'json',
            data: {
                action: 'fetchPageCountAndFormats',
                file: file
            }
        }).done(function (result) {
            isLoading = false;
            $.unblockUI();

            pageFormats = result.pageFormats;

            var pageNumberSelect = '<label for="pageNumber" style="margin-right: 5px;">Page number:</label>'
                + '<select name="data[page]" id="pageNumber">';
            for (var i = 1; i <= result.pageCount; i++) {
                pageNumberSelect += '<option value="' + i + '"' + (i === actualPage ? ' selected="selected"' : '') + '>'
                    + i + '</option>';
            }
            pageNumberSelect += '</select>';

            var fieldset = $('fieldset.pageCount');
            fieldset.empty().html(pageNumberSelect);

            $('select#pageNumber', fieldset).on('change', function () {
                // val() returns a string - keep actualPage numeric
                actualPage = parseInt($(this).val(), 10);
                initImage();
            });

            initImage();
        }).fail(function (error) {
            isLoading = false;
            console.error(error.responseText);
            $.unblockUI();
        });
    });
</script>
</body>
</html>