Interactive GUI
This demo shows you how to filter the result by a specific area on a PDF page. Just draw a rectangle on the image and the component will extract the text in this area from the PDF page.
MuPDF is used to generate an image of the PDF page, and JCrop is laid over this image to select an area. Afterwards, the coordinates of the selected area are normalized to the coordinates inside the original PDF. The SetaPDF-Extractor simply uses these coordinates in a rectangle filter and returns the extracted text at this location.
<?php

// Request controller of the interactive demo. Depending on the "action" query parameter it
// streams the PDF, renders a page preview with mutool, reports page count/sizes as JSON,
// extracts the text inside a rectangle as JSON, or (no/unknown action) renders the GUI.

// load and register the autoload function
require_once __DIR__ . '/../../../../../bootstrap.php';

// Whitelist of selectable documents: public name => path on disk.
$files = [
    'Laboratory-Report.pdf' => $assetsDirectory . '/pdfs/tektown/Laboratory-Report.pdf',
    'Fact-Sheet.pdf' => $assetsDirectory . '/pdfs/tektown/Fact-Sheet.pdf',
    'Terms-and-Conditions.pdf' => $assetsDirectory . '/pdfs/camtown/Terms-and-Conditions.pdf',
];

// Resolution used for the rendered preview images; the GUI converts pixels to points with it.
$dpi = 72;

$action = (isset($_GET['action']) && is_string($_GET['action'])) ? $_GET['action'] : '';

/**
 * Maps the "file" query parameter to a path from the whitelist.
 *
 * @return string
 * @throws Exception If the parameter is missing, not a string or not whitelisted.
 */
$resolveFile = function () use ($files) {
    $name = isset($_GET['file']) ? $_GET['file'] : null;
    if (!is_string($name) || !array_key_exists($name, $files)) {
        throw new Exception('Invalid file!');
    }

    return $files[$name];
};

/**
 * Reads the "page" query parameter as a positive integer.
 *
 * The value ends up in a file name and in a shell argument, so anything that is not a plain
 * positive integer is rejected instead of being passed through.
 *
 * @param int|null $default Value to use when the parameter is absent; null makes it mandatory.
 * @return int
 * @throws Exception If the parameter is missing (and mandatory) or not a positive integer.
 */
$resolvePageNumber = function ($default = null) {
    if (!isset($_GET['page'])) {
        if ($default === null) {
            throw new Exception('Invalid page number!');
        }

        return $default;
    }

    $pageNumber = filter_var($_GET['page'], FILTER_VALIDATE_INT, ['options' => ['min_range' => 1]]);
    if ($pageNumber === false) {
        throw new Exception('Invalid page number!');
    }

    return $pageNumber;
};

/**
 * Reads one coordinate of the selection rectangle from the "data" query parameter.
 *
 * @param string $key One of x1, y1, x2, y2.
 * @return float
 * @throws Exception If the coordinate is missing or not numeric.
 */
$resolveCoordinate = function ($key) {
    if (
        !isset($_GET['data'])
        || !is_array($_GET['data'])
        || !isset($_GET['data'][$key])
        || !is_numeric($_GET['data'][$key])
    ) {
        throw new Exception('Invalid coordinates!');
    }

    return (float) $_GET['data'][$key];
};

/**
 * Reads a file completely and fails loudly instead of sending an empty response.
 *
 * @param string $path
 * @return string
 * @throws Exception If the file cannot be read.
 */
$readFile = function ($path) {
    $content = file_get_contents($path);
    if ($content === false) {
        throw new Exception('Unable to read file!');
    }

    return $content;
};

/**
 * Sends the non-caching response headers shared by all actions followed by the body.
 *
 * @param string $contentType Value of the Content-Type header.
 * @param string $content Response body.
 * @param string|null $filename Optional file name for an inline Content-Disposition header.
 * @return void
 */
$sendResponse = function ($contentType, $content, $filename = null) {
    header('Content-Type: ' . $contentType);
    if ($filename !== null) {
        // the disposition parameter needs its "filename=" name to be valid
        header('Content-Disposition: inline; filename="' . $filename . '"');
    }
    header('Expires: 0');
    header('Cache-Control: must-revalidate, post-check=0, pre-check=0');
    header('Pragma: public');
    header('Accept-Ranges: none');
    header('Content-Length: ' . strlen($content));
    echo $content;
};

if ($action === 'preview') {
    // download the pdf file
    $file = $resolveFile();
    $sendResponse('application/pdf', $readFile($file), 'preview.pdf');
    return;
} elseif ($action === 'generateImagePreview') {
    // generate the preview image of the pdf
    $file = $resolveFile();
    $pageNo = $resolvePageNumber(1);

    $imageFile = 'images/' . basename($file, '.pdf') . '-' . $dpi . '-PAGE.png';
    $realImageFile = str_replace('PAGE', (string) $pageNo, $imageFile);

    // rendered pages are cached on disk and only generated on first request
    if (!file_exists($realImageFile)) {
        // The "%d" page placeholder for mutool is inserted AFTER escaping on purpose:
        // escapeshellarg() replaces percent signs on Windows.
        $cmd = 'mutool draw -F png -r ' . escapeshellarg((string) $dpi)
            . ' -o ' . str_replace('PAGE', '%d', escapeshellarg($imageFile))
            . ' ' . escapeshellarg($file)
            . ' ' . escapeshellarg((string) $pageNo);

        exec($cmd, $output, $resultCode);

        if ($resultCode !== 0) {
            echo 'Thumbnail could not be generated. Please make sure that ' .
                '<a href="https://www.mupdf.com/docs/manual-mutool-draw.html" target="_blank">mutool</a> is installed ' .
                'and that the images/ folder is writable.';
            die();
        }
    }

    $sendResponse('image/png', $readFile($realImageFile), 'image.png');
    return;
} elseif ($action === 'fetchPageCountAndFormats') {
    // fetch the page count and the page size
    $file = $resolveFile();

    $document = \SetaPDF_Core_Document::loadByFilename($file);
    $pages = $document->getCatalog()->getPages();
    $pageCount = $pages->count();

    if ($pageCount === 0) {
        throw new Exception('PDF is empty');
    }

    // one [width, height] pair per page, in page order
    $pageFormats = [];
    for ($i = 1; $i <= $pageCount; $i++) {
        list($width, $height) = $pages->getPage($i)->getWidthAndHeight();
        $pageFormats[] = [$width, $height];
    }

    $sendResponse('application/json', json_encode([
        'pageCount' => $pageCount,
        'pageFormats' => $pageFormats,
    ]));
    return;
} elseif ($action === 'extract') {
    // extract text by selected locations
    $file = $resolveFile();
    $page = $resolvePageNumber();

    // upper left point
    $x1 = $resolveCoordinate('x1');
    $y1 = $resolveCoordinate('y1');
    // lower right point
    $x2 = $resolveCoordinate('x2');
    $y2 = $resolveCoordinate('y2');

    // load the document
    $document = \SetaPDF_Core_Document::loadByFilename($file);

    // the interesting part: initiate an extractor instance
    $extractor = new \SetaPDF_Extractor($document);

    // create an ExactPlain strategy instance
    $strategy = new \SetaPDF_Extractor_Strategy_ExactPlain();
    // pass a rectangle filter to the strategy
    $strategy->setFilter(new \SetaPDF_Extractor_Filter_Rectangle(
        new \SetaPDF_Core_Geometry_Rectangle($x1, $y1, $x2, $y2),
        \SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT
    ));
    $extractor->setStrategy($strategy);

    // get the text of a page
    $result = $extractor->getResultByPageNumber($page);

    // the text is HTML-escaped here because the GUI injects it as markup
    $sendResponse('application/json', json_encode([
        'result' => htmlspecialchars($result),
    ]));
    return;
} else {
    // no (known) action: let the user pick a document and render the GUI for it
    $filePath = displayFiles($files);
    $file = array_search($filePath, $files);
    if ($file === false) {
        throw new Exception('Invalid file selected');
    }

    require './gui.php';
}
<?php
// GUI template of the interactive demo: shows a rendered page, lets the user select an area
// with Jcrop and displays the text extracted for that area.
// It only renders when $dpi, $file and $basePath are defined by the including script.
if (!isset($dpi, $file, $basePath)) {
    die();
}

$script = $_SERVER['SCRIPT_NAME'];

// json_encode() with these flags yields literals that are safe inside an inline <script>.
$jsFlags = JSON_HEX_TAG | JSON_HEX_APOS | JSON_HEX_QUOT | JSON_HEX_AMP;
$loaderHtml = '<img src="'
    . htmlspecialchars($basePath . 'layout/img/ajax-loader-big.gif', ENT_QUOTES)
    . '" />';
?>
<!DOCTYPE html>
<html lang="en">
<head>
    <script src="https://cdn.jsdelivr.net/jquery/2.2.4/jquery.min.js"
            integrity="sha256-BbhdlvQf/xTY9gja0Dq3HiwQF8LaCRTXxZKRutelT44="
            crossorigin="anonymous"
    ></script>
    <script type="text/javascript"
            src="https://cdn.jsdelivr.net/jquery.blockui/2.70.0/jquery.blockUI.min.js"
            integrity="sha256-9wSYpoBdTOlj3azv4n74Mlb984+xKfTS7dhcYRqSqMA="
            crossorigin="anonymous"
    ></script>
    <link rel="stylesheet"
          type="text/css"
          href="https://cdn.jsdelivr.net/jquery.jcrop/0.9.12/css/jquery.Jcrop.min.css"
          integrity="sha256-/fCoT6hQHsrj1J/wn7oNqgWmtm9alQ2QRwWm2B0Fo1o="
          crossorigin="anonymous"/>
    <script src="https://cdn.jsdelivr.net/jquery.jcrop/0.9.12/js/jquery.Jcrop.min.js"
            integrity="sha256-ZxCBLDyBkvv5I47GMz1THCbcQ00JR0BvWlqWUEXupKI="
            crossorigin="anonymous"
    ></script>
</head>
<body>
<table>
    <tr>
        <td>
            <fieldset class="pageCount" style="border: 0;"></fieldset>
            <div class="imageContainer" style="border: 1px solid #d3d3d3;"></div>
        </td>
        <td style="vertical-align: top; padding: 5px;">
            <div class="extractedText"></div>
        </td>
    </tr>
</table>
<script type="text/javascript">
    $(function () {
        var actualPage = 1,
            isLoading = false,
            dpi = <?= json_encode($dpi, $jsFlags) ?>,
            file = <?= json_encode($file, $jsFlags) ?>,
            scriptUrl = <?= json_encode($script, $jsFlags) ?>,
            pageFormats,
            jcrop;

        $.blockUI.defaults.message = <?= json_encode($loaderHtml, $jsFlags) ?>;
        $.extend($.blockUI.defaults.css, {
            backgroundColor: 'transparent',
            border: 'none',
            color: '#fff'
        });

        // Sends the selected area (image pixels, origin top-left) to the controller after
        // converting it to PDF units (origin bottom-left) and shows the extracted text.
        var extractSelection = function (c) {
            if (isLoading) {
                return;
            }

            var pageHeight = pageFormats[actualPage - 1][1];
            var dpiFactor = 1 / 72 * dpi;

            $.blockUI();
            $.ajax({
                url: scriptUrl,
                type: 'GET',
                cache: false,
                dataType: 'json',
                // passing an object lets jQuery URL-encode every value;
                // the nested object is serialized as data[x1]=...&data[y1]=...
                data: {
                    action: 'extract',
                    file: file,
                    page: actualPage,
                    data: {
                        x1: c.x / dpiFactor,
                        y1: pageHeight - c.y / dpiFactor,
                        x2: c.x2 / dpiFactor,
                        y2: pageHeight - c.y2 / dpiFactor
                    }
                }
            }).done(function (result) {
                try {
                    var extractedText = $('div.extractedText');
                    extractedText.empty();
                    // injected as markup; the controller HTML-escapes the text before sending it
                    extractedText.html('<h3>Script Output:</h3><pre>' + result.result + '</pre>');
                } catch (error) {
                    console.error(error);
                }
                $.unblockUI();
            }).fail(function (error) {
                console.error(error.responseText);
                $.unblockUI();
            });
        };

        // (Re)loads the preview image of the current page and attaches Jcrop to it.
        var initImage = function () {
            if (jcrop) {
                jcrop.destroy();
                jcrop = null;
            }

            $('div.extractedText').empty();

            // note: this controller generates an image of the pdf + page number
            var image = $('<img/>', {
                'class': 'demoImage',
                src: scriptUrl + '?' + $.param({
                    action: 'generateImagePreview',
                    file: file,
                    page: actualPage
                })
            });

            // Jcrop needs the final image dimensions, so it is attached once the image is loaded
            image.on('load', function () {
                image.Jcrop({
                    onSelect: extractSelection,
                    onRelease: function () {
                        $('a[href="#code"]').addClass('disabled');
                        $('div.extractedText').empty();
                    }
                }, function () {
                    jcrop = this;
                });
            });

            $('.imageContainer').empty().append(image);
        };

        // initial load: fetch page count and page sizes, build the page selector, show page 1
        isLoading = true;
        actualPage = 1;
        $('div.extractedText').empty();

        $.blockUI();
        $.ajax({
            url: scriptUrl,
            type: 'GET',
            cache: false,
            dataType: 'json',
            data: {
                action: 'fetchPageCountAndFormats',
                file: file
            }
        }).done(function (result) {
            isLoading = false;
            $.unblockUI();

            var fieldset = $('fieldset.pageCount');
            fieldset.empty();

            pageFormats = result.pageFormats;

            var pageNumberSelect = '<label for="pageNumber" style="margin-right: 5px;">Page number:</label>'
                + '<select name="data[page]" id="pageNumber">';
            for (var i = 1; i <= result.pageCount; i++) {
                pageNumberSelect += '<option value="' + i + '"'
                    + (i === actualPage ? ' selected="selected"' : '')
                    + '>' + i + '</option>';
            }
            pageNumberSelect += '</select>';
            fieldset.html(pageNumberSelect);

            $('select#pageNumber', fieldset).change(function () {
                // keep the page number numeric; it is used as an array index and compared strictly
                actualPage = parseInt($(this).val(), 10);
                initImage();
            });

            initImage();
        }).fail(function (error) {
            isLoading = false;
            console.error(error.responseText);
            $.unblockUI();
        });
    });
</script>
</body>
</html>