search for a string in a pdf file using php

20,536

Using the link from the previous response, I changed the code a little to fit my needs, which were SEARCH AND REPLACE. The functions from the article don't need additional libraries, but you do need the FPDI library for the REPLACE part.

Now, this code isn't perfect either, but it's a good start for anyone searching for something similar.

The main problem with this script is that the new data is written over a kind of image of the original PDF template. This means that if you are trying to replace 2 lines of text with 5 lines of text, it won't insert the text but will overwrite the 2 lines plus 3 more. It also means that whatever you overwrite has to have a matching background.

But for some basic needs, for example using a nicely designed business proposal or quote PDF as a template, this script can overwrite the date or client name, all within the PHP app.

/**
 * Decodes data encoded with the PDF ASCIIHexDecode filter.
 *
 * Hex digits are consumed in pairs up to the '>' end-of-data marker.
 * Whitespace is skipped and a '%' starts a comment that runs to the end of
 * the line. An unpaired final digit is treated as the high nibble of a byte
 * whose low nibble is 0.
 *
 * @param string $input Encoded data, terminated by '>'.
 * @return string Decoded bytes; "" if a non-hex character is met or the '>'
 *                marker is missing.
 */
function decodeAsciiHex($input) {
    $output = "";
    $length = strlen($input);

    // Pending high nibble; -1 means no digit is waiting for its partner.
    $codeHigh = -1;
    $isComment = false;

    for ($i = 0; $i < $length && $input[$i] !== '>'; $i++) {
        $c = $input[$i];

        if ($isComment) {
            // Double quotes are required here: '\n' in single quotes is the
            // two-character string backslash + n and never equals a newline.
            if ($c === "\r" || $c === "\n")
                $isComment = false;
            continue;
        }

        if ($c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ')
            continue;

        if ($c === '%') {
            $isComment = true;
            continue;
        }

        // Anything else must be a hex digit.
        if (stripos("0123456789abcdef", $c) === false)
            return "";

        $code = hexdec($c);
        if ($codeHigh < 0) {
            $codeHigh = $code;
        } else {
            $output .= chr($codeHigh * 16 + $code);
            $codeHigh = -1;
        }
    }

    // The loop stops either on '>' or at the end of the input; only the
    // former is a well-formed stream.
    if ($i >= $length)
        return "";

    // Odd number of digits: the last one is a high nibble padded with 0.
    if ($codeHigh >= 0)
        $output .= chr($codeHigh * 16);

    return $output;
}

/**
 * Decodes data encoded with the PDF ASCII85Decode filter.
 *
 * Characters '!' through 'u' are base-85 digits; every group of five digits
 * yields four bytes. A 'z' at a group boundary stands for four zero bytes.
 * Whitespace is skipped and decoding stops at the first '~' (start of the
 * "~>" end-of-data marker) or at the end of the input. A final group of
 * 2 to 4 digits yields 1 to 3 bytes.
 *
 * '%' is not treated as a comment introducer: it lies inside the '!'..'u'
 * range and is the digit with value 4.
 *
 * @param string $input Encoded data.
 * @return string Decoded bytes; "" if an invalid character is met or the
 *                final group consists of a single digit.
 */
function decodeAscii85($input) {
    $output = "";
    $length = strlen($input);

    $ords = array();
    $state = 0; // number of digits collected for the current group

    for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
        $c = $input[$i];

        // Double-quoted so that the escapes denote the real control bytes.
        if ($c === "\0" || $c === "\t" || $c === "\r" || $c === "\f" || $c === "\n" || $c === ' ')
            continue;

        if ($c === 'z' && $state === 0) {
            $output .= str_repeat(chr(0), 4);
            continue;
        }

        $code = ord($c);
        if ($code < ord('!') || $code > ord('u'))
            return "";

        $ords[$state++] = $code - ord('!');

        if ($state == 5) {
            $state = 0;
            for ($sum = 0, $j = 0; $j < 5; $j++)
                $sum = $sum * 85 + $ords[$j];
            for ($j = 3; $j >= 0; $j--)
                $output .= chr(($sum >> ($j * 8)) & 0xff);
        }
    }

    if ($state === 1)
        return "";

    if ($state > 1) {
        // Partial group: the missing low-order digits count as 0 and the
        // last digit present is rounded up by one, so that the leading
        // ($state - 1) bytes come out as encoded.
        $sum = 0;
        for ($k = 0; $k < $state; $k++) {
            $digit = $ords[$k] + ($k === $state - 1 ? 1 : 0);
            $sum += $digit * pow(85, 4 - $k);
        }
        for ($k = 0; $k < $state - 1; $k++)
            $output .= chr(($sum >> ((3 - $k) * 8)) & 0xff);
    }

    return $output;
}

/**
 * Inflates zlib-compressed data, as used by the FlateDecode filter.
 *
 * Warnings from gzuncompress() are suppressed.
 *
 * @param string $input Compressed data.
 * @return string|false The inflated data, or false when gzuncompress() fails.
 */
function decodeFlate($input) {
    $inflated = @gzuncompress($input);

    return $inflated;
}

/**
 * Extracts the entries of the first "<< ... >>" dictionary of a PDF object.
 *
 * The dictionary text is split on "/". A piece of the form "Name value ..."
 * becomes Name => first value token; a bare "Name" becomes Name => true.
 *
 * @param string $object Raw object text.
 * @return array Map of names to their first value token or true; an empty
 *               array when no dictionary is found.
 */
function getObjectOptions($object) {
    $matches = array();
    if (!preg_match("#<<(.*)>>#ismU", $object, $matches)) {
        return $matches;
    }

    $entries = explode("/", $matches[1]);
    // Whatever precedes the first "/" is not a dictionary entry.
    @array_shift($entries);

    $parsed = array();
    foreach ($entries as $entry) {
        // Collapse runs of whitespace so a single space separates tokens.
        $entry = preg_replace("#\s+#", " ", trim($entry));

        if (strpos($entry, " ") === false) {
            $parsed[$entry] = true;
            continue;
        }

        $tokens = explode(" ", $entry);
        $parsed[$tokens[0]] = $tokens[1];
    }

    return $parsed;
}
/**
 * Applies the decoding filters named in an object's options to its stream.
 *
 * Without a "Filter" entry the stream is returned untouched. Otherwise the
 * stream is cut to "Length" bytes (when that entry is non-empty) and every
 * option key naming a supported filter (ASCIIHexDecode, ASCII85Decode,
 * FlateDecode) is applied in the order the keys appear in $options.
 *
 * @param string $stream  Raw stream contents.
 * @param array  $options Options as returned by getObjectOptions().
 * @return string|false Decoded data; whatever the last decoder returned.
 */
function getDecodedStream($stream, $options) {
    if (empty($options["Filter"])) {
        return $stream;
    }

    if (!empty($options["Length"])) {
        $byteCount = $options["Length"];
    } else {
        $byteCount = strlen($stream);
    }
    $decoded = substr($stream, 0, $byteCount);

    foreach (array_keys($options) as $name) {
        // Three independent checks, not a chain: each matching key is applied.
        if ($name == "ASCIIHexDecode") {
            $decoded = decodeAsciiHex($decoded);
        }
        if ($name == "ASCII85Decode") {
            $decoded = decodeAscii85($decoded);
        }
        if ($name == "FlateDecode") {
            $decoded = decodeFlate($decoded);
        }
    }

    return $decoded;
}
/**
 * Collects the raw text operands found in BT..ET text blocks.
 *
 * For each block, the contents of every "[...] TJ" array are appended to
 * $texts; when a block has none, the "(...)" operands of "Td (...) Tj"
 * sequences are appended instead.
 *
 * @param array $texts          Receives the raw operands (appended in place).
 * @param array $textContainers Zero-indexed list of BT..ET block contents.
 * @return void
 */
function getDirtyTexts(&$texts, $textContainers) {
    foreach ($textContainers as $container) {
        $found = array();

        $matched = preg_match_all("#\[(.*)\]\s*TJ#ismU", $container, $found)
            || preg_match_all("#Td\s*(\(.*\))\s*Tj#ismU", $container, $found);

        if ($matched) {
            $texts = array_merge($texts, $found[1]);
        }
    }
}
/**
 * Reads character-code mappings from a CMap stream into $transformations.
 *
 * Handles "beginbfchar ... endbfchar" sections (one "<src> <dst>" pair per
 * line) and "beginbfrange ... endbfrange" sections (one entry per line, as
 * "<from> <to> <dstStart>" or "<from> <to> [<dst> <dst> ...]"). At most the
 * declared number of lines is read from each section.
 *
 * bfchar keys are the source code as written, right-padded with "0" to four
 * characters, which mirrors the padding getTextUsingTransformations() applies
 * before its lookup. bfrange keys and values are upper-case four-digit hex.
 *
 * @param array  $transformations Receives source => target hex strings
 *                                (modified in place).
 * @param string $stream          Decoded stream contents.
 * @return void
 */
function getCharTransformations(&$transformations, $stream) {
    preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
    preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

    foreach ($chars as $section) {
        $lines = explode("\n", trim($section[2]));
        $limit = min((int) $section[1], count($lines));

        for ($k = 0; $k < $limit; $k++) {
            if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($lines[$k]), $map))
                $transformations[str_pad($map[1], 4, "0")] = $map[2];
        }
    }

    foreach ($ranges as $section) {
        $lines = explode("\n", trim($section[2]));
        $limit = min((int) $section[1], count($lines));

        for ($k = 0; $k < $limit; $k++) {
            $line = trim($lines[$k]);

            if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                // Consecutive source codes map to consecutive target codes.
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $_from = hexdec($map[3]);

                for ($m = $from, $n = 0; $m <= $to; $m++, $n++)
                    $transformations[sprintf("%04X", $m)] = sprintf("%04X", $_from + $n);
            } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                // Each source code gets its own entry of the target array.
                $from = hexdec($map[1]);
                $to = hexdec($map[2]);
                $parts = preg_split("#\s+#", trim($map[3]));

                for ($m = $from, $n = 0; $m <= $to && $n < count($parts); $m++, $n++) {
                    // The array entries still carry their "<" and ">"; strip
                    // every non-hex character before hexdec(), which
                    // otherwise raises E_DEPRECATED on PHP 7.4 and later.
                    $target = preg_replace("#[^0-9a-f]#i", "", $parts[$n]);
                    $transformations[sprintf("%04X", $m)] = sprintf("%04X", hexdec($target));
                }
            }
        }
    }
}
/**
 * Converts raw PDF text operands into readable text.
 *
 * Each element of $texts may mix literal strings "(...)" and hex strings
 * "<...>"; everything outside those (such as TJ kerning numbers) is ignored.
 *
 * Literal strings: balanced nested parentheses are kept, and backslash
 * escapes are resolved (\n \r \t \b \f \\ \( \), one to three octal digits,
 * and backslash + end-of-line as a line continuation). For any other escaped
 * character the backslash is dropped and the character kept.
 *
 * Hex strings: digits are read four at a time (a short final group is
 * right-padded with "0"), looked up in $transformations, and emitted as the
 * corresponding characters. The lookup also tries the upper- and lower-case
 * spelling of the code, and a mapped value longer than four digits is
 * emitted as several characters.
 *
 * @param array $texts           Raw text operands.
 * @param array $transformations Map built by getCharTransformations().
 * @return string Decoded text, with "\n" appended after each element.
 */
function getTextUsingTransformations($texts, $transformations) {
    // Double-quoted on purpose: '\n' in single quotes would be the two
    // characters backslash + n rather than a newline.
    $escapes = array(
        "n" => "\n",
        "r" => "\r",
        "t" => "\t",
        "b" => chr(8),
        "f" => "\f",
        "\\" => "\\",
        "(" => "(",
        ")" => ")",
    );

    $document = "";
    foreach ($texts as $text) {
        $isHex = false;
        $depth = 0; // parenthesis nesting level; > 0 while inside a literal string
        $hex = "";
        $plain = "";
        $length = strlen($text);

        for ($j = 0; $j < $length; $j++) {
            $c = $text[$j];

            if ($depth > 0) {
                if ($c === "\\") {
                    if ($j + 1 >= $length)
                        break; // dangling backslash at the very end

                    $next = $text[$j + 1];
                    if (isset($escapes[$next])) {
                        $plain .= $escapes[$next];
                        $j++;
                    } elseif (preg_match('#^[0-7]{1,3}#', substr($text, $j + 1, 3), $oct)) {
                        // Only the leading octal digits belong to the escape.
                        $plain .= html_entity_decode("&#" . octdec($oct[0]) . ";");
                        $j += strlen($oct[0]);
                    } elseif ($next === "\r" || $next === "\n") {
                        // Line continuation: swallow the end-of-line marker.
                        $j++;
                        if ($next === "\r" && $j + 1 < $length && $text[$j + 1] === "\n")
                            $j++;
                    } else {
                        $plain .= $next;
                        $j++;
                    }
                } elseif ($c === "(") {
                    $depth++;
                    $plain .= $c;
                } elseif ($c === ")") {
                    $depth--;
                    if ($depth === 0)
                        $document .= $plain;
                    else
                        $plain .= $c;
                } else {
                    $plain .= $c;
                }
                continue;
            }

            if ($c === "(") {
                $plain = "";
                $depth = 1;
            } elseif ($c === "<") {
                $hex = "";
                $isHex = true;
            } elseif ($c === ">") {
                if ($isHex && $hex !== "") {
                    foreach (str_split($hex, 4) as $unit) {
                        $chex = str_pad($unit, 4, "0");

                        if (isset($transformations[$chex]))
                            $chex = (string) $transformations[$chex];
                        elseif (isset($transformations[strtoupper($chex)]))
                            $chex = (string) $transformations[strtoupper($chex)];
                        elseif (isset($transformations[strtolower($chex)]))
                            $chex = (string) $transformations[strtolower($chex)];

                        // A mapping may expand to several 4-digit code units.
                        foreach (str_split($chex, 4) as $codeUnit)
                            $document .= html_entity_decode("&#x" . $codeUnit . ";");
                    }
                }
                $isHex = false;
            } elseif ($isHex && stripos("0123456789abcdef", $c) !== false) {
                // Whitespace inside a hex string is not part of the data.
                $hex .= $c;
            }
        }
        $document .= "\n";
    }

    return $document;
}

/**
 * Returns the last two operands that precede the first "Tm" operator in a
 * BT..ET block or, when that yields nothing, the first "Td" operator.
 *
 * @param string $container Contents of a BT..ET block.
 * @return array|null array(x, y) as strings, or null when neither operator
 *                    is preceded by at least two operands.
 */
function pdf2textFindPosition($container) {
    foreach (array("Tm", "Td") as $operator) {
        $pos = strpos($container, $operator);
        if ($pos === false)
            continue;

        // Split on any whitespace so operands separated by newlines are
        // found too; the x/y operands are the last two before the operator.
        $operands = preg_split('#\s+#', substr($container, 0, $pos), -1, PREG_SPLIT_NO_EMPTY);
        if (count($operands) >= 2)
            return array_slice($operands, -2);
    }

    return null;
}

/**
 * Searches the text of a PDF file for a keyword and reports where it is drawn.
 *
 * Every "obj ... endobj" section containing a stream is examined. Objects
 * whose dictionary has a Length1, Type or Subtype entry are skipped. A
 * decoded stream holding BT..ET text blocks is searched; any other decoded
 * stream is scanned for character mappings, which apply to text found later.
 *
 * The reported page is a counter incremented for every examined stream
 * object, so it is only an approximation of the real page number. The font
 * size is the size operand of the most recent "/Name size Tf" seen up to and
 * including the matching text block.
 *
 * @param string $filename Path of the PDF file.
 * @param string $search   Keyword to look for.
 * @return array|string|false array('keyword', 'page', 'font', 'x', 'y') for
 *         the first hit ('font' is null when no Tf was seen); "" when the
 *         file cannot be read or is empty; false when nothing is found.
 */
function pdf2text($filename, $search) {
    $pageNumber = 1;
    $infile = @file_get_contents($filename);
    if (empty($infile))
        return "";

    $transformations = array();
    $fontSize = null;

    preg_match_all("#obj(.*)endobj#ismU", $infile, $objects);
    $objects = isset($objects[1]) ? $objects[1] : array();

    foreach ($objects as $currentObject) {
        if (!preg_match("#stream(.*)endstream#ismU", $currentObject, $stream))
            continue;
        $stream = ltrim($stream[1]);

        $options = getObjectOptions($currentObject);
        if (!(empty($options["Length1"]) && empty($options["Type"]) && empty($options["Subtype"])))
            continue;

        $data = getDecodedStream($stream, $options);
        if (is_string($data) && $data !== "") {
            if (preg_match_all("#BT(.*)ET#ismU", $data, $textContainers)) {
                // Work block by block so each text stays paired with the
                // block it came from; a block may yield zero or many texts.
                foreach ($textContainers[1] as $container) {
                    if (preg_match('#/[^\s/]+\s+(-?[0-9]*\.?[0-9]+)\s+Tf#', $container, $m))
                        $fontSize = $m[1];

                    $texts = array();
                    getDirtyTexts($texts, array($container));

                    foreach ($texts as $text) {
                        $textTransformed = getTextUsingTransformations(array($text), $transformations);
                        if (strpos($textTransformed, $search) === false)
                            continue;

                        $position = pdf2textFindPosition($container);
                        if ($position === null)
                            continue;

                        return array(
                            'keyword' => $search,
                            'page' => $pageNumber,
                            'font' => $fontSize,
                            'x' => $position[0],
                            'y' => $position[1],
                        );
                    }
                }
            } else {
                getCharTransformations($transformations, $data);
            }
        }

        $pageNumber++;
    }

    return false;
}

Now for the usage:

What I did was save the PDF template file with some "keywords" that will be overwritten. For example, to change the date on the template PDF I placed the keyword @DATE where I wanted it, and so on for the other keywords.

The modified pdf2text function returns false if the search word you're looking for is not found in the PDF. If it is found, it returns an array with the searched keyword, the page on which it was found, the font size of the word, and the X and Y values.

    // initiate FPDI
$pdf = new FPDI('P', 'pt', 'A4');
$pdf->setPrintHeader(false);

// set the source file
$pdffile = "template.pdf";
$numberOfPages = $pdf->setSourceFile($pdffile);

$theReturnedFecha = pdf2text($pdffile, "@DATE");
$theReturnedCliente = pdf2text($pdffile, "@CLIENT");

// Draws $text in a white-filled cell at the position reported by pdf2text(),
// using the reported font size, so that it covers the keyword on the page.
// NOTE(review): y is negated before the font size is subtracted; this
// presumably matches the coordinates of the template in use - confirm.
$overwriteKeyword = function ($pdf, $match, $text) {
    $pdf->SetFont('Helvetica');
    $pdf->SetFontSize($match['font']);
    $pdf->SetTextColor(0, 0, 0);
    $pdf->SetFillColor(255, 255, 255);
    $pdf->SetXY($match['x'], ($match['y'] * -1) - $match['font']);
    $pdf->Cell(0, 0, $text, 0, 0, 'L', true);
};

// import all pages
for ($i = 1; $i <= $numberOfPages; $i++) {
    $pdf->AddPage();
    $tplIdx = $pdf->importPage($i);
    $pdf->useTemplate($tplIdx);

    // pdf2text() returns an array only on a hit; it returns false when the
    // keyword is absent and "" when the file could not be read, so test for
    // an array rather than for "not false".
    if (is_array($theReturnedFecha) && $theReturnedFecha['page'] == $i) {
        $overwriteKeyword($pdf, $theReturnedFecha, dateInSpanish("2015-11-04"));
    }

    if (is_array($theReturnedCliente) && $theReturnedCliente['page'] == $i) {
        $overwriteKeyword($pdf, $theReturnedCliente, $x_nombre_cli);
    }
}

$pdf->Output();

Hope it helps anyone

bye

Share:
20,536
Tabrez Ahmed
Author by

Tabrez Ahmed

Updated on December 23, 2020

Comments

  • Tabrez Ahmed
    Tabrez Ahmed over 3 years

    I want to search for a string in a PDF file the way strstr() does. But when a PDF is read as plain text, it gives you unintelligible junk. How do I do it?

    Perhaps pdflib has some solution, but my hosting provider doesn't help me install it.

  • DeveloperChris
    DeveloperChris over 7 years
    Thanks for this. Unfortunately it doesn't appear to work: pdf2text fails to return any reference. What PDF version did you test it on?