nas_docker_compose/kodbox/site/app/kod/FileParsePdf.class.php

<?php
/*
* @link http://kodcloud.com/
* @author warlee | e-mail:kodcloud@qq.com
* @copyright warlee 2014.(Shanghai)Co.,Ltd
* @license http://kodcloud.com/tools/license/license.txt
*/


/**
 * Parses a PDF file and extracts its metadata.
 *
 * pdfparser https://www.pdfparser.org/documentation
 * mpdf (editing): http://mpdf.github.io/
 */
class FileParsePdf{
	/**
	 * Extract basic metadata from a PDF file.
	 *
	 * Only a window at the start and a window at the end of the file are
	 * read (32 KiB each; decodeXref() may enlarge them). The Info dictionary
	 * is resolved through the trailer, and page size, PDF version and page
	 * count are picked out of the two windows with regular expressions.
	 *
	 * @param string $filePath path handed to filesize_64() and StreamWrapperIO::read()
	 * @return array normalized metadata as produced by parseInfoItem()
	 */
	public static function parse($filePath){
		$chunkSize = 32 * 1024; // size of the window sampled at each end of the file
		$fileSize  = filesize_64($filePath);

		// A file smaller than one window is covered completely by a single
		// read. Shrinking the window keeps "size - chunkSize" (the base of the
		// end window used by the offset arithmetic elsewhere) at zero instead
		// of letting it go negative.
		if($fileSize > 0 && $fileSize < $chunkSize){
			$chunkSize = intval($fileSize);
		}

		// No file handle is kept here: every read goes through
		// StreamWrapperIO::read() by path, so an fopen() would only leak.
		$fileInfo = array(
			'path'		=> $filePath,
			'size'		=> $fileSize,
			'chunkSize' => $chunkSize,
		);
		$fileInfo['dataStart'] = StreamWrapperIO::read($filePath,0,$chunkSize);
		$fileInfo['dataEnd']   = StreamWrapperIO::read($filePath,max(0,$fileSize - $chunkSize),$chunkSize);

		// Info dictionary, located through the trailer.
		$dataInfo = array();
		$xref = self::decodeXref($fileInfo);
		if($xref){
			$dataInfo = self::getObjectValue($fileInfo,$xref,$xref['trailer']['info']);
		}
		if(!is_array($dataInfo)){
			$dataInfo = array();
		}

		// decodeXref() receives $fileInfo by reference and may have replaced
		// both windows, so they are picked up only now.
		$chunks = array((string)$fileInfo['dataStart'],(string)$fileInfo['dataEnd']);

		// Page size: first /MediaBox [llx lly urx ury] found; the size is the
		// distance between the two corners.
		$dataInfo['sizeWidth'] = 0;
		$theReg = '/[\s]*\/MediaBox[\s]*\[[\s]*([0-9\.]+)[\s]+([0-9\.]+)[\s]+([0-9\.]+)[\s]+([0-9\.]+)[\s]*\]/i';
		foreach($chunks as $chunk){
			if($dataInfo['sizeWidth']) break;
			if(!preg_match($theReg,$chunk,$matches)) continue;
			$dataInfo['sizeWidth']  = floatval($matches[3]) - floatval($matches[1]);
			$dataInfo['sizeHeight'] = floatval($matches[4]) - floatval($matches[2]);
		}

		// PDF version from the "%PDF-x.y" header.
		if(preg_match('/%PDF-([0-9\.]+)/',$chunks[0],$matches)){
			$dataInfo['version'] = $matches[1];
		}

		// Page count: the first "/Count N" of each window, keeping the larger.
		$dataInfo['pageNumber'] = 0;
		$theReg = "/[\s]*\/Count[\s]+([0-9]+)[\s]*/i";
		foreach($chunks as $chunk){
			if(preg_match($theReg,$chunk,$matches) && $dataInfo['pageNumber'] < intval($matches[1])){
				$dataInfo['pageNumber'] = intval($matches[1]);
			}
		}

		return self::parseInfoItem($dataInfo);
	}
	/**
	 * Pick the known fields out of the raw info array and normalize them.
	 *
	 * Fields that are missing, empty or arrays are left out of the result.
	 * 'int' fields go through intval(); 'date' fields lose a leading "D:"
	 * (the following 14 characters are kept) and are reformatted as
	 * "Y-m-d H:i:s" when strtotime() understands them.
	 *
	 * @param array $dataInfo raw values keyed by their PDF / internal names
	 * @return array|false normalized values, or false when $dataInfo is empty
	 */
	private static function parseInfoItem($dataInfo){
		if(!$dataInfo) return false;

		// output key => array(source key, value type)
		$fieldMap = array(
			'title' 		 => array('Title',''),				// title
			'auther'	 	 => array('Author',''),				// author
			'createTime'	 => array('CreationDate','date'),	// creation date
			'modifyTime' 	 =>	array('ModDate','date'),		// modification date
			'pageNumber'	 => array('pageNumber','int'),		// page count
			'sizeWidth'		 => array('sizeWidth','int'),		// page width
			'sizeHeight'	 => array('sizeHeight','int'),		// page height
			'creator'	 	 => array('Creator',''),			// authoring application
			'producer'	 	 => array('Producer',''),			// producing software
			'pdfVersion'	 => array('version',''),			// PDF version
		);

		$parsed = array();
		foreach ($fieldMap as $outKey => $rule){
			list($srcKey,$type) = $rule;
			if(!isset($dataInfo[$srcKey])) continue;

			$raw = $dataInfo[$srcKey];
			if(!$raw || is_array($raw)) continue;

			if($type == 'int'){
				$raw = intval($raw);
			}else if($type == 'date'){
				if(substr($raw,0,2) == 'D:'){
					$raw = substr($raw,2,14);
				}
				$stamp = strtotime($raw);
				if($stamp){
					$raw = date('Y-m-d H:i:s',$stamp);
				}
			}
			$parsed[$outKey] = $raw;
		}
		return $parsed;
	}

	/**
	 * Build a cross-reference map and read the trailer from the end window.
	 *
	 * When the startxref position lies outside the current end window, both
	 * windows in $fileInfo are re-read four times larger (never larger than
	 * the file itself) and $fileInfo['chunkSize'] is updated to match.
	 *
	 * /Prev chains (incremental updates) are not followed.
	 *
	 * @param array $fileInfo passed by reference; keys used: path, size,
	 *                        chunkSize, dataStart, dataEnd
	 * @return array|false array('trailer' => ..., 'xref' => "num_gen" => file offset),
	 *                     or false when no startxref marker or no /Info
	 *                     reference is found
	 */
	private static function decodeXref(&$fileInfo){
		$pdfData = $fileInfo['dataEnd'];
		$xref    = array('trailer'=>array(),'xref'=>array());
		$theReg  = '/[\r\n]startxref[\s]*[\r\n]*([0-9]+)[\s]*[\r\n]+%%EOF/i';
		if(!preg_match_all($theReg,$pdfData, $matches,PREG_SET_ORDER,0)) return false;

		// The xref section starts before the current end window: enlarge it.
		$startxref = intval($matches[0][1]);
		if($fileInfo['size'] - $startxref > $fileInfo['chunkSize']){
			$chunkSize = 4 * $fileInfo['chunkSize'];
			// Never ask for more than the file holds; otherwise the read
			// offset "size - chunkSize" below would be negative.
			if($chunkSize > $fileInfo['size']){
				$chunkSize = $fileInfo['size'];
			}
			$fileInfo['chunkSize'] = $chunkSize;
			$fileInfo['dataStart'] = StreamWrapperIO::read($fileInfo['path'],0,$chunkSize);
			$fileInfo['dataEnd']   = StreamWrapperIO::read($fileInfo['path'],$fileInfo['size'] - $chunkSize,$chunkSize);
			$pdfData = $fileInfo['dataEnd'];
		}

		// Classic xref table rows: "<offset> <generation> n|f". Objects are
		// numbered in the order the rows appear, starting at 0.
		$objNum = 0;
		preg_match_all('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/',$pdfData,$matches);
		foreach ($matches[3] as $i=>$flag){
			if($flag == 'n'){
				$index = $objNum.'_'.intval($matches[2][$i]);
				$xref['xref'][$index] = intval($matches[1][$i]);
				++$objNum;
			}else if($flag == 'f'){
				++$objNum;
			}
		}

		// "N G obj" headers found directly in the end window take precedence
		// over the table (and cover files without a usable table).
		if(preg_match_all('/[\r\n]([0-9]+)[\s]+([0-9]+)[\s]+obj/iU',$pdfData, $matches)){
			$fileOffset = $fileInfo['size'] - $fileInfo['chunkSize'];
			foreach ($matches[0] as $i => $theValue) {
				$key = $matches[1][$i].'_'.$matches[2][$i];
				// +1 skips the line break the pattern starts with.
				$xref['xref'][$key] = strpos($pdfData,$theValue) + $fileOffset + 1;
			}
		}

		if(preg_match_all('/trailer[\s]*<<(.*)>>/isU',$pdfData,$matches)){
			$trailerData = count($matches[1]) == 1 ? $matches[1][0] : $matches[1][1];
		}else{
			// No "trailer" keyword: search the last 5 KiB of the window instead.
			$trailerData = substr($pdfData, -1024*5);
		}
		if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $matches) > 0) {
			$xref['trailer']['size'] = intval($matches[1]);
		}
		if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) {
			$xref['trailer']['root'] = intval($matches[1]).'_'.intval($matches[2]);
		}
		if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) {
			$xref['trailer']['encrypt'] = intval($matches[1]).'_'.intval($matches[2]);
		}
		if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) {
			$xref['trailer']['info'] = intval($matches[1]).'_'.intval($matches[2]);
		}
		if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $matches) > 0) {
			$xref['trailer']['id'] = array($matches[1],$matches[2]);
		}

		// Without an Info reference there is nothing for the caller to resolve.
		if(empty($xref['trailer']['info'])) return false;
		return $xref;
	}

	/**
	 * Resolve an object through the xref map and return its content.
	 *
	 * A dictionary is returned as "name => value"; indirect references among
	 * the values are followed one level and string values are run through
	 * decodeStr(). Hexadecimal string values are handed to decodeStr() in
	 * their "<...>" form so that they are decoded as well.
	 *
	 * @param array  $fileInfo file windows as built by parse()
	 * @param array  $xref     result of decodeXref()
	 * @param string $infoKey  object key in "num_gen" form
	 * @return array|string the dictionary entries, the raw value when the
	 *                      object is a plain string, or an empty array
	 */
	private static function getObjectValue($fileInfo,$xref,$infoKey){
		// Unknown object: nothing to resolve.
		if(!isset($xref['xref'][$infoKey])) return array();
		$dataInfoRef = self::getObject($fileInfo,$xref['xref'][$infoKey]);

		if(is_string($dataInfoRef[1])) return $dataInfoRef[1];
		if(!is_array($dataInfoRef[1])) return array();

		$items    = $dataInfoRef[1];
		$total    = count($items);
		$dataInfo = array();
		// Tokens come as key/value pairs; a trailing key without a value is ignored.
		for ($i = 0; $i + 1 < $total; $i += 2){
			$itemKey   = $items[$i];
			$itemValue = $items[$i+1];
			if(count($itemKey) != 3 || $itemKey[0] != '/') continue;

			if($itemValue[0] === 'objref'){
				// Dangling reference: skip the entry.
				if(!isset($xref['xref'][$itemValue[1]])) continue;
				$itemValue = self::getObject($fileInfo,$xref['xref'][$itemValue[1]]);
			}
			$value = $itemValue[1];
			if($value === false) continue;
			if(is_string($value)){
				if($itemValue[0] === '<' && $value !== ''){
					// getRawObject() drops the angle brackets of a hex string;
					// decodeStr() needs them to recognise it.
					$value = '<'.preg_replace('/\s+/','',$value).'>';
				}
				$value = self::decodeStr($value);
			}
			$dataInfo[$itemKey[1]] = $value;
		}
		return $dataInfo;
	}
	/**
	 * Read the object stored at a file offset.
	 *
	 * Two tokens are read in a row: the one at $offset (presumably the
	 * "N G obj" header, given how decodeXref() computes offsets) and the one
	 * that follows it, which is returned.
	 *
	 * @param array $fileInfo file windows as built by parse()
	 * @param int   $offset   absolute offset of the first token
	 * @return array array(type, value, absolute offset after the token)
	 */
	private static function getObject($fileInfo,$offset){
		$header = self::getObjectItem($fileInfo,$offset);
		// $header[2] is the position right after the first token.
		return self::getObjectItem($fileInfo,$header[2]);
	}
	/**
	 * Read one token at an absolute file offset.
	 *
	 * The token is taken from the cached end window when the offset falls
	 * inside it, from the cached start window when the offset is below
	 * chunkSize, and from a fresh read at that offset otherwise. The offset
	 * in the result is converted back to an absolute file position.
	 *
	 * @param array $fileInfo keys used: path, size, chunkSize, dataStart, dataEnd
	 * @param int   $offset   absolute file offset
	 * @return array array(type, value, absolute offset after the token)
	 */
	private static function getObjectItem($fileInfo,$offset){
		$chunkSize  = $fileInfo['chunkSize'];
		$fileOffset = $fileInfo['size'] - $chunkSize; // absolute position of dataEnd

		if($offset >= $fileOffset){
			$base    = $fileOffset;
			$theData = $fileInfo['dataEnd'];
		}else if($offset >= $chunkSize){
			// Between the two cached windows. ">=" matters: dataStart only
			// holds bytes 0 .. chunkSize-1, so offset == chunkSize is already
			// outside of it.
			$base    = $offset;
			$theData = StreamWrapperIO::read($fileInfo['path'],$offset,$chunkSize);
		}else{
			$base    = 0;
			$theData = $fileInfo['dataStart'];
		}

		$dataIndex = self::getRawObject($theData,$offset - $base);
		// Back from window-relative to absolute position.
		$dataIndex[2] = $dataIndex[2] + $base;
		return $dataIndex;
	}

	/**
	 * Decode the text of a PDF string value.
	 *
	 * Steps, in order:
	 *  1. backslash escapes (\\ \space \/ \( \) \n \r \t and octal \ddd),
	 *     resolved in a single left-to-right pass so that the output of one
	 *     escape is never read as the start of another;
	 *  2. "#dd" sequences (two decimal digits, read as a hex byte);
	 *  3. "<hex digits>" groups, unpacked to bytes;
	 *  4. text starting with the UTF-16BE byte order mark FE FF is converted
	 *     to UTF-8.
	 *
	 * @param string $text raw value as returned by getRawObject()
	 * @return string decoded text
	 */
	private static function decodeStr($text){
		$escapes = array(
			'\\' => '\\',
			' '  => ' ',
			'/'  => '/',
			'('  => '(',
			')'  => ')',
			'n'  => "\n",
			'r'  => "\r",
			't'  => "\t",
		);
		$text = preg_replace_callback('/\\\\([0-7]{1,3}|.)/s', static function ($m) use ($escapes) {
			$code = $m[1];
			if (isset($escapes[$code])) {
				return $escapes[$code];
			}
			if (strspn($code, '01234567') === strlen($code)) {
				// Octal character code; high-order overflow is dropped.
				return \chr(octdec($code) & 0xFF);
			}
			// Not an escape handled here: leave it untouched.
			return $m[0];
		}, $text);

		$text = preg_replace_callback('/#(\d{2})/', static function ($m) {
			return \chr(hexdec($m[1]));
		}, $text);

		// Only well-formed "<hex>" groups are unpacked; other text in angle
		// brackets (e.g. "<Untitled>") is left as it is.
		$text = preg_replace_callback('/<([a-f0-9]+)>/i', static function ($m) {
			return pack('H*', $m[1]);
		}, $text);

		if (strncmp($text, "\xFE\xFF", 2) === 0) {
			// Drop the byte order mark and convert the rest in one go, so
			// that surrogate pairs are combined correctly.
			$text = mb_convert_encoding(substr($text, 2), 'UTF-8', 'UTF-16BE');
		}
		return $text;
	}
	/**
	 * Read one raw PDF token (or nested array/dictionary) from a buffer.
	 *
	 * Leading white space is skipped. The type tag in the result is one of:
	 * '/' (name), '(' or ')' (literal string), '[' or ']' (array),
	 * '<<' or '>>' (dictionary), '<' or '>' (hex string), 'endobj', 'null',
	 * 'boolean', 'stream', 'endstream', 'objref', 'obj', 'numeric', or ''
	 * when nothing was recognised. Arrays and dictionaries carry a list of
	 * nested results as their value.
	 *
	 * @param string $pdfData buffer to read from
	 * @param int    $offset  position in $pdfData to start at
	 * @return array array(type, value, offset just after the token)
	 */
	private static function getRawObject($pdfData, $offset = 0){
		$objtype = ''; // object type to be returned
		$objval = ''; // object value to be returned

		// Nothing to read at or past the end of the buffer. (strspn() below
		// would also reject such an offset on PHP 8.)
		if ($offset >= strlen($pdfData)) {
			return array($objtype, $objval, $offset);
		}
		/*
		 * skip initial white space chars:
		 *      \x00 null (NUL)
		 *      \x09 horizontal tab (HT)
		 *      \x0A line feed (LF)
		 *      \x0C form feed (FF)
		 *      \x0D carriage return (CR)
		 *      \x20 space (SP)
		 */
		$offset += strspn($pdfData, "\x00\x09\x0a\x0c\x0d\x20", $offset);
		if (!isset($pdfData[$offset])) {
			// Only white space was left.
			return array($objtype, $objval, $offset);
		}
		$char = $pdfData[$offset];

		switch ($char) {
			case '%':  // \x25 PERCENT SIGN
				// skip comment and search for next token
				$next = strcspn($pdfData, "\r\n", $offset);
				if ($next > 0) {
					$offset += $next;
					return self::getRawObject($pdfData, $offset);
				}
				break;
			case '/':  // \x2F SOLIDUS
				// name object
				$objtype = $char;
				++$offset;
				$pregResult = preg_match(
					'/^([^\x00\x09\x0a\x0c\x0d\x20\s\x28\x29\x3c\x3e\x5b\x5d\x7b\x7d\x2f\x25]+)/',
					substr($pdfData, $offset, 256),
					$matches
				);
				if (1 == $pregResult) {
					$objval = $matches[1]; // unescaped value
					$offset += strlen($objval);
				}
				break;
			case '(':  // \x28 LEFT PARENTHESIS
			case ')':  // \x29 RIGHT PARENTHESIS
				// literal string object
				$objtype = $char;
				++$offset;
				$strpos = $offset;
				if ('(' == $char) {
					// Scan to the matching ')' while honouring nesting and
					// backslash escapes.
					$open_bracket = 1;
					while ($open_bracket > 0) {
						if (!isset($pdfData[$strpos])) break;
						$ch = $pdfData[$strpos];
						switch ($ch) {
							case '\\':  // REVERSE SOLIDUS (5Ch) (Backslash)
								// skip next character
								++$strpos;
								break;
							case '(':  // LEFT PARENTHESIS (28h)
								++$open_bracket;
								break;
							case ')':  // RIGHT PARENTHESIS (29h)
								--$open_bracket;
								break;
						}
						++$strpos;
					}
					$objval = substr($pdfData, $offset, ($strpos - $offset - 1));
					$offset = $strpos;
				}
				break;
			case '[':  // \x5B LEFT SQUARE BRACKET
			case ']':  // \x5D RIGHT SQUARE BRACKET
				// array object
				$objtype = $char;
				++$offset;
				if ('[' == $char) {
					// get array content; stop on ']' or when no progress is made
					$objval = array();
					do {
						$oldOffset = $offset;
						// get element
						$element = self::getRawObject($pdfData, $offset);
						$offset = $element[2];
						$objval[] = $element;
					} while ((']' != $element[0]) && ($offset != $oldOffset));
					// remove closing delimiter
					array_pop($objval);
				}
				break;
			case '<':  // \x3C LESS-THAN SIGN
			case '>':  // \x3E GREATER-THAN SIGN
				if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) {
					// dictionary object
					$objtype = $char.$char;
					$offset += 2;
					if ('<' == $char) {
						// get dictionary content; stop on '>>' or when no progress is made
						$objval = array();
						do {
							$oldOffset = $offset;
							// get element
							$element = self::getRawObject($pdfData, $offset);
							$offset = $element[2];
							$objval[] = $element;
						} while (('>>' != $element[0]) && ($offset != $oldOffset));
						// remove closing delimiter
						array_pop($objval);
					}
				} else {
					// hexadecimal string object
					$objtype = $char;
					++$offset;
					$pregResult = preg_match('/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU',substr($pdfData, $offset),$matches);
					if (('<' == $char) && 1 == $pregResult) {
						// Remove white space characters. str_replace() is used
						// because strtr() with an empty replacement string
						// changes nothing.
						$objval = str_replace(array("\x09", "\x0a", "\x0c", "\x0d", "\x20"), '', $matches[1]);
						$offset += \strlen($matches[0]);
					} elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) {
						$offset = $endpos + 1;
					}
				}
				break;
			default:
				if ('endobj' == substr($pdfData, $offset, 6)) {
					// indirect object end
					$objtype = 'endobj';
					$offset += 6;
				} elseif ('null' == substr($pdfData, $offset, 4)) {
					// null object
					$objtype = 'null';
					$offset += 4;
					$objval = 'null';
				} elseif ('true' == substr($pdfData, $offset, 4)) {
					// boolean true object
					$objtype = 'boolean';
					$offset += 4;
					$objval = 'true';
				} elseif ('false' == substr($pdfData, $offset, 5)) {
					// boolean false object
					$objtype = 'boolean';
					$offset += 5;
					$objval = 'false';
				} elseif ('stream' == substr($pdfData, $offset, 6)) {
					// start stream object
					$objtype = 'stream';
					$offset += 6;
					if (1 == preg_match('/^([\r]?[\n])/isU', substr($pdfData, $offset), $matches)) {
						$offset += strlen($matches[0]);
						$endStreamReg = '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU';
						$pregResult = preg_match($endStreamReg,substr($pdfData, $offset),$matches,PREG_OFFSET_CAPTURE);
						if (1 == $pregResult) {
							$objval = substr($pdfData, $offset, $matches[0][1]);
							$offset += $matches[1][1];
						}
					}
				} elseif ('endstream' == substr($pdfData, $offset, 9)) {
					// end stream object
					$objtype = 'endstream';
					$offset += 9;
				} elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+R/iU', substr($pdfData, $offset, 33), $matches)) {
					// indirect reference "N G R"
					$objtype = 'objref';
					$offset += strlen($matches[0]);
					$objval = intval($matches[1]).'_'.intval($matches[2]);
				} elseif (1 == preg_match('/^([0-9]+)[\s]+([0-9]+)[\s]+obj/iU', substr($pdfData, $offset, 33), $matches)) {
					// object header "N G obj"
					$objtype = 'obj';
					$objval = intval($matches[1]).'_'.intval($matches[2]);
					$offset += strlen($matches[0]);
				} elseif (($numlen = strspn($pdfData, '+-.0123456789', $offset)) > 0) {
					// numeric object
					$objtype = 'numeric';
					$objval = substr($pdfData, $offset, $numlen);
					$offset += $numlen;
				}
				break;
		}
		return array($objtype, $objval, $offset);
	}
}