Filter by Form Fields
This demo extracts data from several flat (non-dynamic) PDF form documents.
The field information is resolved from a real dynamic PDF form template (aka AcroForm).
PHP
<?php

/**
 * Demo: extract the values of flat (non-dynamic) PDF forms.
 *
 * A PDF form template supplies the field names and the rectangles of their
 * widget annotations. Each rectangle is then used as a named rectangle filter
 * when extracting text from the flat documents, so the result is keyed by
 * field name.
 */

// load and register the autoload function
// NOTE(review): $assetsDirectory is not defined in this script - presumably it is set by bootstrap.php.
require_once __DIR__ . '/../../../../../bootstrap.php';

// Let's load a template in which we'd drawn simple form fields to get the names and
// coordinates of the areas we want to extract
$template = \SetaPDF_Core_Document::loadByFilename(
    $assetsDirectory . '/pdfs/Subscription-tekMag-form-template.pdf'
);
$pages = $template->getCatalog()->getPages();

// group the found fields by pages: [pageNo => [fieldName => rectangle]]
$fieldsPerPage = [];
for ($pageNo = 1, $pageCount = $pages->count(); $pageNo <= $pageCount; $pageNo++) {
    $fieldsPerPage[$pageNo] = [];

    $page = $pages->getPage($pageNo);
    $annotations = $page->getAnnotations();

    // get all widget annotations
    $widgetAnnotations = $annotations->getAll(\SetaPDF_Core_Document_Page_Annotation::TYPE_WIDGET);
    foreach ($widgetAnnotations as $widgetAnnotation) {
        $fieldName = \SetaPDF_Core_Document_Catalog_AcroForm::resolveFieldName(
            $widgetAnnotation->getDictionary()
        );
        $fieldsPerPage[$pageNo][$fieldName] = $widgetAnnotation->getRect()->getRectangle();
    }
}

// clean up
$template->cleanUp();
unset($page, $pages, $template);

// let's extract the data from these files...
$paths = [
    $assetsDirectory . '/pdfs/lenstown/Subscription-tekMag-filled-flat.pdf',
    $assetsDirectory . '/pdfs/camtown/Subscription-tekMag-filled-flat.pdf',
    $assetsDirectory . '/pdfs/etown/Subscription-tekMag-filled-flat.pdf',
];

foreach ($paths as $path) {
    // headline: the path relative to the pdfs directory
    echo '<h1>' . htmlspecialchars(substr($path, strlen($assetsDirectory . '/pdfs/'))) . '</h1>';

    // load the document
    $document = \SetaPDF_Core_Document::loadByFilename($path);

    // create a plain strategy
    $strategy = new \SetaPDF_Extractor_Strategy_Plain();

    // create an extractor instance
    $extractor = new \SetaPDF_Extractor($document, $strategy);

    // iterate through the pages we want to extract data from.
    foreach ($fieldsPerPage as $pageNo => $fields) {
        // define a multi filter
        $filter = new \SetaPDF_Extractor_Filter_Multi();

        // create additional rectangle filters named by the found fields and ...
        foreach ($fields as $name => $rect) {
            $fieldFilter = new \SetaPDF_Extractor_Filter_Rectangle(
                $rect,
                \SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT,
                $name
            );

            // ...pass them to the multi filter
            $filter->addFilter($fieldFilter);
        }

        // set the filter
        $strategy->setFilter($filter);

        // get the result
        $result = $extractor->getResultByPageNumber($pageNo);

        // clean up the result because the values lay on top of other text which needs to be removed
        $result = array_map(function ($s) {
            // "\xef\x82\xa8" is the UTF-8 encoding of U+F0A8 (a private-use code point)
            $s = str_replace("\xef\x82\xa8", '', $s);

            // strip newlines and dots from both ends
            return trim($s, "\n.");
        }, $result);

        // NOTE(review): var_dump() output is printed without HTML escaping.
        echo "<pre>";
        var_dump($result);
        echo "</pre>";
    }

    // release the document the same way the template was released above, so
    // loaded documents do not pile up in memory while iterating over the files
    $document->cleanUp();
    unset($extractor, $strategy, $document);
}