Filter by Annotations
This demo collects all text markup annotations and creates rectangle filter instances based on their quad point arrays.
The instances are then added to a multi filter. The result is a mapping of text markup annotations to the underlying text.
PHP
<?php
use setasign\SetaPDF2\Core\Document;
use setasign\SetaPDF2\Core\Document\Page\Annotation\Annotation;
use setasign\SetaPDF2\Core\Document\Page\Annotation\TextMarkupAnnotation;
use setasign\SetaPDF2\Core\Geometry\Rectangle;
use setasign\SetaPDF2\Extractor\Extractor;
use setasign\SetaPDF2\Extractor\Filter\MultiFilter;
use setasign\SetaPDF2\Extractor\Filter\RectangleFilter;
use setasign\SetaPDF2\Extractor\Strategy\ExactPlainStrategy;
// load and register the autoload function
require_once __DIR__ . '/../../../../../bootstrap.php';
// candidate PDF documents offered for selection
// ($assetsDirectory is not defined in this file; presumably provided by bootstrap.php)
$files = [
$assetsDirectory . '/pdfs/camtown/Terms-and-Conditions - revised.pdf',
$assetsDirectory . '/pdfs/Brand-Guide - with-comments.pdf',
];
// displayFiles() is defined outside this file; its return value is used as the path of the PDF to load
$path = displayFiles($files);
// create a document instance
$document = Document::loadByFilename($path);
// initiate an extractor instance
$extractor = new Extractor($document);
// get the pages object of the document
$pages = $document->getCatalog()->getPages();
// we are going to save the extracted text in this variable (keyed by page number)
$results = [];
// map page numbers and filter names to annotation instances
$annotationsByPageAndFilterName = [];
// Iterate over all pages. For each page:
//   1. collect the annotations of the types this demo reports on,
//   2. register one rectangle filter per quadrilateral of every text markup annotation,
//   3. extract the page text through these filters and store it in $results[$pageNo].
// Every collected annotation is also stored in $annotationsByPageAndFilterName so that
// it can be matched with the extracted text (by filter name) afterwards.
for ($pageNo = 1, $pageCount = $pages->count(); $pageNo <= $pageCount; $pageNo++) {
    // get the page object
    $page = $pages->getPage($pageNo);

    // get the annotations (array_filter() preserves the keys, so $tmpId below is the
    // position of the annotation in the unfiltered list)
    $annotations = array_filter(
        $page->getAnnotations()->getAll(),
        static function (Annotation $annotation): bool {
            return in_array(
                $annotation->getType(),
                [
                    Annotation::TYPE_HIGHLIGHT,
                    Annotation::TYPE_STRIKE_OUT,
                    Annotation::TYPE_CARET,
                    Annotation::TYPE_UNDERLINE,
                ],
                true
            );
        }
    );

    // create a strategy instance
    $strategy = new ExactPlainStrategy();
    // create a multi filter instance
    $filter = new MultiFilter();
    // and pass it to the strategy
    $strategy->setFilter($filter);

    // iterate over all collected annotations
    foreach ($annotations as $tmpId => $annotation) {
        /**
         * Not every collected annotation is a text markup annotation (see the
         * instanceof check below), so only the base type is guaranteed here.
         *
         * @var Annotation $annotation
         */
        $name = 'P#' . $pageNo . '/TMA#' . $tmpId;
        $annotationName = $annotation->getName();
        if ($annotationName) {
            $name .= ' (' . $annotationName . ')';
        }

        // Only text markup annotations get filters. Other collected annotations
        // are still recorded below and will show up without extracted text.
        if ($annotation instanceof TextMarkupAnnotation) {
            // iterate over the quad points to setup our filter instances
            $quadPoints = $annotation->getQuadPoints();

            // One quadrilateral is described by 8 numbers (4 x/y pairs). Stop before a
            // trailing, incomplete group so that a malformed array cannot lead to
            // undefined offsets and a bogus rectangle.
            for ($pos = 0, $c = count($quadPoints); $pos + 7 < $c; $pos += 8) {
                $xs = [
                    $quadPoints[$pos],
                    $quadPoints[$pos + 2],
                    $quadPoints[$pos + 4],
                    $quadPoints[$pos + 6],
                ];
                $ys = [
                    $quadPoints[$pos + 1],
                    $quadPoints[$pos + 3],
                    $quadPoints[$pos + 5],
                    $quadPoints[$pos + 7],
                ];

                // bounding box of the quadrilateral, enlarged by 1 unit on each side
                $llx = min($xs) - 1;
                $urx = max($xs) + 1;
                $lly = min($ys) - 1;
                $ury = max($ys) + 1;

                // reduce it to a small line: both y values end up at the vertical
                // centre of the box minus 1, i.e. the rectangle has no height
                $diff = ($ury - $lly) / 2;
                $lly = $lly + $diff - 1;
                $ury = $ury - $diff - 1;

                // Add a new rectangle filter to the multi filter instance. All
                // quadrilaterals of an annotation share the same filter name.
                // NOTE(review): MODE_CONTACT presumably matches text that touches the
                // rectangle, which is what makes a zero-height line usable - confirm.
                $filter->addFilter(
                    new RectangleFilter(
                        new Rectangle($llx, $lly, $urx, $ury),
                        RectangleFilter::MODE_CONTACT,
                        $name
                    )
                );
            }
        }

        $annotationsByPageAndFilterName[$pageNo][$name] = $annotation;
    }

    // if no filters for this page defined, ignore it
    if (count($filter->getFilters()) === 0) {
        continue;
    }

    // pass the strategy to the extractor instance
    $extractor->setStrategy($strategy);
    // and get the results by the current page number
    $result = $extractor->getResultByPageNumber($pageNo);
    if ($result === '') {
        continue;
    }

    $results[$pageNo] = $result;
}
// debug output: one HTML table per page that has collected annotations; each row shows
// the filter name, the text extracted for it (empty if none), the subject and the contents
foreach ($annotationsByPageAndFilterName as $pageNumber => $pageAnnotations) {
    echo '<h1>Page No #' . $pageNumber . '</h1>'
        . '<table border="1" width="100%"><tr><th>Name</th><th>Text</th><th>Subject</th><th>Comment</th></tr>';

    foreach ($pageAnnotations as $filterName => $pageAnnotation) {
        // start the row with the (escaped) filter name
        $row = '<tr><td>' . htmlspecialchars($filterName) . '</td>';

        // the remaining cells are rendered as preformatted text, missing values as ''
        $preformattedCells = [
            $results[$pageNumber][$filterName] ?? '',
            $pageAnnotation->getSubject() ?? '',
            $pageAnnotation->getContents() ?? '',
        ];
        foreach ($preformattedCells as $cellText) {
            $row .= '<td><pre>' . htmlspecialchars($cellText) . '</pre></td>';
        }

        echo $row . '</tr>';
    }

    echo '</table>';
}
