Get and Mark Words
This demo uses two rectangle filters to extract the invoicing party and the invoice number from an invoice. For demonstration purposes, both the filter areas and the words found are outlined in the resulting PDF document.
PHP
<?php

/**
 * Demo script: extracts the words located in two rectangular areas of the
 * first page of an invoice PDF (invoicing party and invoice number) and
 * outlines both the areas and the resolved words on that page.
 */

// Load the bootstrap file, which registers the autoload function.
// NOTE(review): $assetsDirectory and displayFiles() are not defined in this
// script; presumably bootstrap.php provides them - confirm.
require_once __DIR__ . '/../../../../../bootstrap.php';

$files = glob($assetsDirectory . '/pdfs/*/eBook-Invoice.pdf');
$path = displayFiles($files);

$document = \SetaPDF_Core_Document::loadByFilename($path);

// Extractor bound to the loaded document, working with a word strategy.
$extractor = new \SetaPDF_Extractor($document);
$wordStrategy = new \SetaPDF_Extractor_Strategy_Word();

// Area in which the invoicing party is looked up (filter id "invoicingParty").
$partyArea = new \SetaPDF_Core_Geometry_Rectangle(40, 705, 220, 720);
$partyFilter = new \SetaPDF_Extractor_Filter_Rectangle(
    $partyArea,
    \SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT,
    'invoicingParty'
);

// Area in which the invoice number is looked up (filter id "invoiceNo").
$numberArea = new \SetaPDF_Core_Geometry_Rectangle(512, 520, 580, 540);
$numberFilter = new \SetaPDF_Extractor_Filter_Rectangle(
    $numberArea,
    \SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT,
    'invoiceNo'
);

// Combine both rectangle filters and hand them to the strategy.
$combinedFilter = new \SetaPDF_Extractor_Filter_Multi([$partyFilter, $numberFilter]);
$wordStrategy->setFilter($combinedFilter);
$extractor->setStrategy($wordStrategy);

// Resolve the words of page 1.
/** @var \SetaPDF_Extractor_Result_Words $foundWords */
$foundWords = $extractor->getResultByPageNumber(1);

// Everything below is drawn onto the canvas of page 1.
$pageCanvas = $document->getCatalog()->getPages()->getPage(1)->getCanvas();

// Outline the filter areas with stroke color [1, 0, 1]
// (invoice number area first, then the invoicing party area).
foreach ([$numberFilter, $partyFilter] as $areaFilter) {
    $area = $areaFilter->getRectangle();
    $lowerLeft = $area->getLl();

    $pageCanvas->setStrokingColor([1, 0, 1]);
    $pageCanvas->draw()->rect(
        $lowerLeft->getX(),
        $lowerLeft->getY(),
        $area->getWidth(),
        $area->getHeight()
    );
}

// Outline every boundary of every resolved word with stroke color [0, 1, 0].
/** @var \SetaPDF_Extractor_Result_Word $foundWord */
foreach ($foundWords as $foundWord) {
    // The id of the filter that resolved this word is available through:
    // $filterId = $foundWord->getFilterId();
    foreach ($foundWord->getBounds() as $wordBounds) {
        $lowerLeft = $wordBounds->getLl();
        $upperRight = $wordBounds->getUr();

        // Width and height are derived from the lower-left and upper-right corners.
        $pageCanvas->setStrokingColor([0, 1, 0]);
        $pageCanvas->draw()->rect(
            $lowerLeft->getX(),
            $lowerLeft->getY(),
            $upperRight->getX() - $lowerLeft->getX(),
            $upperRight->getY() - $lowerLeft->getY()
        );
    }
}

// Write the modified document through the HTTP writer as "document.pdf".
$httpWriter = new \SetaPDF_Core_Writer_Http('document.pdf', true);
$document->setWriter($httpWriter);
$document->save()->finish();