SetaPDF Demos

Invoice Recipients and Numbers

This demo takes a bunch of invoices and extracts the address field and the invoice number from each of them. After that, it groups the resolved information in an array by invoice recipient.

PHP
<?php

/**
 * Groups invoice numbers by invoice recipient.
 *
 * Every PDF file matching the glob pattern below is opened, the text found by
 * two rectangle filters (recipient block and invoice number) is extracted from
 * each page, and the findings are collected per recipient (name + company
 * name). The grouped data is finally written out as HTML.
 */

// load and register the autoload function
// NOTE(review): $assetsDirectory is not defined in this file - presumably it is
// provided by the bootstrap file; confirm.
require_once __DIR__ . '/../../../../../bootstrap.php';

// glob() returns false on error; fall back to an empty list so that the loop
// below is simply skipped instead of raising a warning
$files = glob($assetsDirectory . '/pdfs/tektown/invoices/[0-9]*.pdf') ?: [];

// prepare the resulting array, keyed by "<name>|<company name>"
$invoicesByCustomerName = [];

foreach ($files as $file) {
    // initiate a document instance
    $document = SetaPDF_Core_Document::loadByFilename($file);

    // initiate an extractor instance
    $extractor = new SetaPDF_Extractor($document);

    // get the strategy of the extractor (the plain strategy is the default one)
    $strategy = $extractor->getStrategy();

    // define a rectangle filter for the invoice recipient name
    // NOTE(review): the coordinates are presumably lower-left/upper-right
    // corners in page units - confirm against the invoice layout.
    $recipientNameFilter = new SetaPDF_Extractor_Filter_Rectangle(
        new SetaPDF_Core_Geometry_Rectangle(40, 665, 260, 700),
        SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT,
        'recipient'
    );

    // define another rectangle filter for the invoice number
    $invoiceNoFilter = new SetaPDF_Extractor_Filter_Rectangle(
        new SetaPDF_Core_Geometry_Rectangle(512, 520, 580, 540),
        SetaPDF_Extractor_Filter_Rectangle::MODE_CONTACT,
        'invoiceNo'
    );

    // pass the filters to the strategy by using a filter chain
    $strategy->setFilter(new SetaPDF_Extractor_Filter_Multi([$recipientNameFilter, $invoiceNoFilter]));

    // now walk through the pages and ...
    $pages = $document->getCatalog()->getPages();
    // the page count does not change while reading, so resolve it only once
    $pageCount = $pages->count();
    for ($pageNo = 1; $pageNo <= $pageCount; $pageNo++) {

        // ... extract the content found by the specific filters.
        $result = $extractor->getResultByPageNumber($pageNo);

        // the result is read by the filter ids given above; guard the access so
        // that a missing key does not raise an "undefined array key" warning
        // NOTE(review): whether a filter without a match leaves its key unset
        // is an assumption - confirm against the extractor documentation.
        $invoiceNo = isset($result['invoiceNo']) ? $result['invoiceNo'] : '';
        $recipient = isset($result['recipient']) ? $result['recipient'] : '';

        // create single lines of the recipient
        $recipientLines = explode("\n", $recipient);

        // the name can be found in the first item
        $name = (string) array_shift($recipientLines);
        // the optional company name is the next item; array_shift() returns
        // null if there is no further line, so cast it to a string to never
        // pass null to htmlentities() later (deprecated since PHP 8.1)
        $companyName = (string) array_shift($recipientLines);

        // create a unique key
        $key = $name . '|' . $companyName;

        // save the name and company data and prepare the result
        if (!isset($invoicesByCustomerName[$key])) {
            $invoicesByCustomerName[$key] = [
                'name'        => $name,
                'companyName' => $companyName,
                'invoices'    => []
            ];
        }

        // add the invoice number, page number and file to the result
        $invoicesByCustomerName[$key]['invoices'][] = [
            'invoiceNo' => $invoiceNo,
            'pageNo'    => $pageNo,
            'file'      => $file
        ];
    }

    // release memory
    $extractor->cleanUp();
    $document->cleanUp();
}

// length of the path prefix that is stripped from the file names in the output
$pathPrefixLength = strlen($assetsDirectory . '/pdfs/');

// output the resolved data:
foreach ($invoicesByCustomerName as $customerData) {
    echo '<h1>Customer: ' . htmlentities($customerData['name']) . ' / '
        . htmlentities($customerData['companyName']) . '</h1>';

    echo '<ul>';
    foreach ($customerData['invoices'] as $invoice) {
        echo '<li>Invoice Number #' . htmlentities($invoice['invoiceNo'])
            . ' on page #' . $invoice['pageNo'] . ' in ';
        echo htmlspecialchars(substr($invoice['file'], $pathPrefixLength));
        echo '</li>';
    }

    echo '</ul>';
}