Increase PDF format support (more test samples used)

This commit is contained in:
Grégory Soutadé 2021-09-09 20:49:00 +02:00
parent f243271215
commit 89e9fdc55e
3 changed files with 85 additions and 13 deletions

View File

@ -80,6 +80,7 @@ namespace uPDFParser
private: private:
void parseObject(std::string& token); void parseObject(std::string& token);
void parseStartXref();
void parseXref(); void parseXref();
void parseTrailer(); void parseTrailer();

View File

@ -35,7 +35,7 @@ namespace uPDFParser
class DataType class DataType
{ {
public: public:
enum TYPE {BOOLEAN, INTEGER, REAL, NAME, STRING, HEXASTRING, REFERENCE, ARRAY, DICTIONARY, STREAM}; enum TYPE {BOOLEAN, INTEGER, REAL, NAME, STRING, HEXASTRING, REFERENCE, ARRAY, DICTIONARY, STREAM, NULLOBJECT};
DataType(TYPE _type): DataType(TYPE _type):
_type(_type) _type(_type)
@ -248,6 +248,20 @@ namespace uPDFParser
private: private:
int startOffset, endOffset; int startOffset, endOffset;
}; };
/**
 * @brief Data type representing the "null" keyword.
 *
 * Tagged with DataType::TYPE::NULLOBJECT and carries no state of its own.
 */
class Null : public DataType
{
public:
    Null()
        : DataType(DataType::TYPE::NULLOBJECT)
    {
    }

    /** @return a newly allocated Null instance. */
    virtual DataType* clone()
    {
        return new Null();
    }

    /** @return always false: a null object holds no value. */
    bool value()
    {
        return false;
    }

    /** @return the textual form of this object, i.e. "null". */
    virtual std::string str()
    {
        return "null";
    }
};
} }
#endif #endif

View File

@ -64,7 +64,17 @@ namespace uPDFParser
} }
if (c == '\n' || c == '\r') if (c == '\n' || c == '\r')
break; {
// Empty line
if (!res)
{
size++ ;
res--;
continue;
}
else
break;
}
buffer[res] = c; buffer[res] = c;
} }
@ -87,9 +97,15 @@ namespace uPDFParser
if (read(fd, &c, 1) != 1) if (read(fd, &c, 1) != 1)
break; break;
if (c == '\n') if (c == '\n' || c == '\r')
break; break;
} }
// Support \r\n and \n\r
if (read(fd, &c, 1) == 1)
{
if (c != '\n' && c != '\r')
lseek(fd, -1, SEEK_CUR);
}
} }
/** /**
@ -97,15 +113,17 @@ namespace uPDFParser
*/ */
std::string Parser::nextToken(bool exceptionOnEOF) std::string Parser::nextToken(bool exceptionOnEOF)
{ {
char c; char c = 0, prev_c;
std::string res(""); std::string res("");
int i; int i;
static const char delims[] = " \t<>[]()+-/"; static const char delims[] = " \t<>[]()/";
static const char whitespace_prev_delims[] = "+-"; // Need whitespace before
static const char start_delims[] = "<>[]()"; static const char start_delims[] = "<>[]()";
bool found = false; bool found = false;
while (!found) while (!found)
{ {
prev_c = c;
if (read(fd, &c, 1) != 1) if (read(fd, &c, 1) != 1)
{ {
if (exceptionOnEOF) if (exceptionOnEOF)
@ -146,6 +164,20 @@ namespace uPDFParser
} }
} }
// Push character until delimiter is found
if (!found && prev_c == ' ')
{
for (i=0; i<(int)sizeof(whitespace_prev_delims); i++)
{
if (c == whitespace_prev_delims[i])
{
lseek(fd, -1, SEEK_CUR);
found = true;
break;
}
}
}
if (!found) if (!found)
res += c; res += c;
} }
@ -182,11 +214,23 @@ namespace uPDFParser
return res; return res;
} }
void Parser::parseTrailer() void Parser::parseStartXref()
{ {
std::string token; std::string token;
char buffer[10]; char buffer[10];
// std::cout << "Parse startxref" << std::endl;
token = nextToken();
readline(fd, buffer, sizeof(buffer), false);
if (strncmp(buffer, "%%EOF", 5))
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
}
void Parser::parseTrailer()
{
std::string token;
// std::cout << "Parse trailer" << std::endl; // std::cout << "Parse trailer" << std::endl;
token = nextToken(); token = nextToken();
@ -200,10 +244,7 @@ namespace uPDFParser
if (token != "startxref") if (token != "startxref")
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset); EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
token = nextToken(); parseStartXref();
readline(fd, buffer, sizeof(buffer), false);
if (strncmp(buffer, "%%EOF", 5))
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
} }
void Parser::parseXref() void Parser::parseXref()
@ -329,6 +370,8 @@ namespace uPDFParser
return new Boolean(true); return new Boolean(true);
else if (token == "false") else if (token == "false")
return new Boolean(false); return new Boolean(false);
else if (token == "null")
return new Null();
else else
EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset); EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset);
@ -536,7 +579,8 @@ namespace uPDFParser
{ {
char buf[16]; char buf[16];
std::string token; std::string token;
bool secondLine = true;
if (fd) if (fd)
close(fd); close(fd);
@ -546,7 +590,7 @@ namespace uPDFParser
EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)"); EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)");
// Check %PDF at startup // Check %PDF at startup
readline(fd, buf, 4); readline(fd, buf, 4, false);
if (strncmp(buf, "%PDF", 4)) if (strncmp(buf, "%PDF", 4))
EXCEPTION(INVALID_HEADER, "Invalid PDF header"); EXCEPTION(INVALID_HEADER, "Invalid PDF header");
finishLine(fd); finishLine(fd);
@ -572,8 +616,21 @@ namespace uPDFParser
parseXref(); parseXref();
else if (token[0] >= '1' && token[0] <= '9') else if (token[0] >= '1' && token[0] <= '9')
parseObject(token); parseObject(token);
// Can have startxref without trailer (not end of document)
else if (token == "startxref")
parseStartXref();
else else
EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset); {
// The second line may be not commented and invalid (for UTF8 stuff)
if (!secondLine)
{
EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset);
}
else
finishLine(fd);
}
// If for optimization
if (secondLine) secondLine = false;
} }
close(fd); close(fd);