Multiple improvements:

* nextToken() can return comments if needed (used for %%EOF)
* Handle an object that follows %%EOF without a line break
* Handle a trailer without xref
* Handle nested parentheses in string objects
* Fix an error in stream parsing (fall back to reading in big chunks)
This commit is contained in:
Grégory Soutadé 2021-09-28 14:35:45 +02:00
parent 7f2a84c10c
commit ea551b6f52
2 changed files with 89 additions and 44 deletions

View File

@ -81,10 +81,10 @@ namespace uPDFParser
private: private:
void parseObject(std::string& token); void parseObject(std::string& token);
void parseStartXref(); void parseStartXref();
void parseXref(); bool parseXref();
void parseTrailer(); bool parseTrailer();
std::string nextToken(bool exceptionOnEOF=true); std::string nextToken(bool exceptionOnEOF=true, bool readComment=false);
DataType* parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict); DataType* parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict);
void parseDictionary(Object* object, std::map<std::string, DataType*>& dict); void parseDictionary(Object* object, std::map<std::string, DataType*>& dict);

View File

@ -111,7 +111,7 @@ namespace uPDFParser
/** /**
* @brief Find next token to analyze * @brief Find next token to analyze
*/ */
std::string Parser::nextToken(bool exceptionOnEOF) std::string Parser::nextToken(bool exceptionOnEOF, bool readComment)
{ {
char c = 0, prev_c; char c = 0, prev_c;
std::string res(""); std::string res("");
@ -134,8 +134,30 @@ namespace uPDFParser
// Comment, skip line // Comment, skip line
if (c == '%') if (c == '%')
{ {
if (readComment)
{
curOffset = lseek(fd, 0, SEEK_CUR)-1;
res += c;
while (true)
{
if (read(fd, &c, 1) != 1)
{
if (exceptionOnEOF)
EXCEPTION(TRUNCATED_FILE, "Unexpected end of file");
break;
}
if (c == '\n' || c == '\r')
break;
res += c;
}
break;
}
finishLine(fd); finishLine(fd);
break; if (res.size())
break;
else
continue;
} }
// White character while empty result, continue // White character while empty result, continue
@ -217,17 +239,22 @@ namespace uPDFParser
void Parser::parseStartXref() void Parser::parseStartXref()
{ {
std::string token; std::string token;
char buffer[10];
// std::cout << "Parse startxref" << std::endl; // std::cout << "Parse startxref" << std::endl;
token = nextToken(); token = nextToken(); // XREF offset
readline(fd, buffer, sizeof(buffer), false); token = nextToken(false, true); // %%EOF
if (strncmp(buffer, "%%EOF", 5)) if (strncmp(token.c_str(), "%%EOF", 5))
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
/*
Handle special case where we have :
%%EOF1 0 obj\n
*/
if (token.size() > 5)
lseek(fd, curOffset+5, SEEK_SET);
} }
void Parser::parseTrailer() bool Parser::parseTrailer()
{ {
std::string token; std::string token;
@ -241,16 +268,22 @@ namespace uPDFParser
parseDictionary(&trailer, trailer.dictionary().value()); parseDictionary(&trailer, trailer.dictionary().value());
token = nextToken(); token = nextToken();
/* trailer without xref */
if (token != "startxref") if (token != "startxref")
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); {
lseek(fd, curOffset, SEEK_SET);
return false;
}
parseStartXref(); parseStartXref();
return true;
} }
void Parser::parseXref() bool Parser::parseXref()
{ {
std::string token; std::string token;
bool res = false;
// std::cout << "Parse xref" << std::endl; // std::cout << "Parse xref" << std::endl;
xrefOffset = curOffset; xrefOffset = curOffset;
@ -260,10 +293,12 @@ namespace uPDFParser
if (token == "trailer") if (token == "trailer")
{ {
parseTrailer(); res = parseTrailer();
break; break;
} }
} }
return res;
} }
static DataType* tokenToNumber(std::string& token, char sign='\0') static DataType* tokenToNumber(std::string& token, char sign='\0')
@ -405,16 +440,26 @@ namespace uPDFParser
std::string res(""); std::string res("");
char c; char c;
bool escaped = false; bool escaped = false;
int parenthesis_count = 1; /* Handle parenthesis in parenthesis */
while (1) while (1)
{ {
if (read(fd, &c, 1) != 1) if (read(fd, &c, 1) != 1)
break; break;
if (c == ')' && !escaped) if (c == '(' && !escaped)
parenthesis_count++;
else if (c == ')' && !escaped)
parenthesis_count--;
if (c == ')' && !escaped && parenthesis_count == 0)
break; break;
escaped = (c == '\\'); /* Handle \\ */
if (c == '\\' && escaped)
escaped = false;
else
escaped = (c == '\\');
res += c; res += c;
} }
@ -457,37 +502,37 @@ namespace uPDFParser
EXCEPTION(INVALID_STREAM, "No Length property at offset " << curOffset); EXCEPTION(INVALID_STREAM, "No Length property at offset " << curOffset);
DataType* Length = (*object)["Length"]; DataType* Length = (*object)["Length"];
if (Length->type() != DataType::INTEGER) // Try with a direct jump if no filter applied (Flatedecode)
if (!object->hasKey("Filter") && Length->type() == DataType::INTEGER)
{ {
if (Length->type() != DataType::REFERENCE) Integer* length = (Integer*)Length;
EXCEPTION(INVALID_STREAM, "Invalid Length property at offset " << curOffset); endOffset = startOffset + length->value();
lseek(fd, endOffset, SEEK_SET);
token = nextToken();
// Don't want to parse xref table... if (token == "endstream")
while (1) return new Stream(startOffset, endOffset);
{
char buffer[4*1024]; // No endstream, come back at the begining
int ret; lseek(fd, startOffset, SEEK_SET);
endOffset = lseek(fd, 0, SEEK_CUR);
ret = readline(fd, buffer, sizeof(buffer));
if (!strncmp(buffer, "endstream", 9))
{
lseek(fd, -(ret-9), SEEK_CUR);
break;
}
}
return new Stream(startOffset, endOffset);
} }
Integer* length = (Integer*)Length; // Don't want to parse xref table...
endOffset = startOffset + length->value(); while (1)
lseek(fd, endOffset, SEEK_SET); {
token = nextToken(); char buffer[4*1024];
char* subs;
if (token != "endstream") int ret;
EXCEPTION(INVALID_STREAM, "endstream not found at offset " << endOffset); ret = readline(fd, buffer, sizeof(buffer));
subs = (char*)memmem((void*)buffer, ret, (void*)"endstream", 9);
// std::cout << "end parseStream" << std::endl; if (subs)
{
unsigned long pos = (unsigned long)subs - (unsigned long)buffer;
lseek(fd, -(ret-pos-9), SEEK_CUR);
endOffset = lseek(fd, 0, SEEK_CUR);
break;
}
}
return new Stream(startOffset, endOffset); return new Stream(startOffset, endOffset);
} }
@ -553,7 +598,7 @@ namespace uPDFParser
object = new Object(objectId, generationNumber, offset); object = new Object(objectId, generationNumber, offset);
_objects.push_back(object); _objects.push_back(object);
while (1) while (1)
{ {
token = nextToken(); token = nextToken();