uPDFParser

uPDFParser Commit Details

Date: 2021-09-09 20:49:00 (1 month 8 days ago)
Author: Grégory Soutadé
Branch: master
Commit: 89e9fdc55ed78002e62c834646d281bf1320099a
Parents: f2432712159b22b57f82c5450ad79421c4c4cd9c
Message: Increase PDF format support (more test samples used)

Changes:
M include/uPDFParser.h (1 diff)
M include/uPDFTypes.h (2 diffs)
M src/uPDFParser.cpp (10 diffs)

File differences

include/uPDFParser.h
8080
8181
8282
83
8384
8485
8586
private:
void parseObject(std::string& token);
void parseStartXref();
void parseXref();
void parseTrailer();
include/uPDFTypes.h
3535
3636
3737
38
38
3939
4040
4141
......
248248
249249
250250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
251265
252266
253267
class DataType
{
public:
enum TYPE {BOOLEAN, INTEGER, REAL, NAME, STRING, HEXASTRING, REFERENCE, ARRAY, DICTIONARY, STREAM};
enum TYPE {BOOLEAN, INTEGER, REAL, NAME, STRING, HEXASTRING, REFERENCE, ARRAY, DICTIONARY, STREAM, NULLOBJECT};
DataType(TYPE _type):
_type(_type)
private:
int startOffset, endOffset;
};
/**
 * Data type for the "null" keyword: tagged with DataType::TYPE::NULLOBJECT
 * and rendered back as the literal text "null".
 */
class Null : public DataType
{
public:
// Tag the DataType base with the NULLOBJECT type
Null():
DataType(DataType::TYPE::NULLOBJECT)
{}
// Return a new heap-allocated Null (raw pointer from new; presumably owned by the caller -- TODO confirm)
virtual DataType* clone() {return new Null();}
// Always false: the literal 0 is converted to bool
bool value() {return 0;}
// Textual representation of this object
virtual std::string str() { return "null";}
private:
};
}
#endif
src/uPDFParser.cpp
6464
6565
6666
67
67
68
69
70
71
72
73
74
75
76
77
6878
6979
7080
......
8797
8898
8999
90
100
91101
92102
103
104
105
106
107
108
93109
94110
95111
......
97113
98114
99115
100
116
101117
102118
103
119
120
104121
105122
106123
107124
108125
126
109127
110128
111129
......
146164
147165
148166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
149181
150182
151183
......
182214
183215
184216
185
217
186218
187219
188220
189221
222
223
224
225
226
227
228
229
230
231
232
233
190234
191235
192236
......
200244
201245
202246
203
204
205
206
247
207248
208249
209250
......
329370
330371
331372
373
374
332375
333376
334377
......
536579
537580
538581
539
582
583
540584
541585
542586
......
546590
547591
548592
549
593
550594
551595
552596
......
572616
573617
574618
619
620
621
575622
576
623
624
625
626
627
628
629
630
631
632
633
577634
578635
579636
}
if (c == '\n' || c == '\r')
break;
{
// Empty line
if (!res)
{
size++ ;
res--;
continue;
}
else
break;
}
buffer[res] = c;
}
if (read(fd, &c, 1) != 1)
break;
if (c == '\n')
if (c == '\n' || c == '\r')
break;
}
// Support \r\n and \n\r
if (read(fd, &c, 1) == 1)
{
if (c != '\n' && c != '\r')
lseek(fd, -1, SEEK_CUR);
}
}
/**
*/
std::string Parser::nextToken(bool exceptionOnEOF)
{
char c;
char c = 0, prev_c;
std::string res("");
int i;
static const char delims[] = " \t<>[]()+-/";
static const char delims[] = " \t<>[]()/";
static const char whitespace_prev_delims[] = "+-"; // Need whitespace before
static const char start_delims[] = "<>[]()";
bool found = false;
while (!found)
{
prev_c = c;
if (read(fd, &c, 1) != 1)
{
if (exceptionOnEOF)
}
}
// Push character until delimiter is found
if (!found && prev_c == ' ')
{
for (i=0; i<(int)sizeof(whitespace_prev_delims); i++)
{
if (c == whitespace_prev_delims[i])
{
lseek(fd, -1, SEEK_CUR);
found = true;
break;
}
}
}
if (!found)
res += c;
}
return res;
}
void Parser::parseTrailer()
/**
 * Parse what follows a "startxref" keyword: read one token, then one line
 * that must start with "%%EOF".
 *
 * This function does not read the "startxref" keyword itself; it only reads
 * what comes after it.
 * Raises INVALID_TRAILER through EXCEPTION() when the "%%EOF" marker is not found.
 */
void Parser::parseStartXref()
{
std::string token;
char buffer[10];
// std::cout << "Parse startxref" << std::endl;
// Token after the keyword is read but not used afterwards
// (presumably the xref table offset -- TODO confirm)
token = nextToken();
// Read the next line into buffer
// NOTE(review): meaning of the trailing 'false' flag is defined by readline(), not visible here -- confirm
readline(fd, buffer, sizeof(buffer), false);
// strncmp() returns non-zero when the first 5 characters differ from "%%EOF"
if (strncmp(buffer, "%%EOF", 5))
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
}
void Parser::parseTrailer()
{
std::string token;
// std::cout << "Parse trailer" << std::endl;
token = nextToken();
if (token != "startxref")
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
token = nextToken();
readline(fd, buffer, sizeof(buffer), false);
if (strncmp(buffer, "%%EOF", 5))
EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
parseStartXref();
}
void Parser::parseXref()
return new Boolean(true);
else if (token == "false")
return new Boolean(false);
else if (token == "null")
return new Null();
else
EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset);
{
char buf[16];
std::string token;
bool secondLine = true;
if (fd)
close(fd);
EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)");
// Check %PDF at startup
readline(fd, buf, 4);
readline(fd, buf, 4, false);
if (strncmp(buf, "%PDF", 4))
EXCEPTION(INVALID_HEADER, "Invalid PDF header");
finishLine(fd);
parseXref();
else if (token[0] >= '1' && token[0] <= '9')
parseObject(token);
// Can have startxref without trailer (not end of document)
else if (token == "startxref")
parseStartXref();
else
EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset);
{
// The second line may be not commented and invalid (for UTF8 stuff)
if (!secondLine)
{
EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset);
}
else
finishLine(fd);
}
// If for optimization
if (secondLine) secondLine = false;
}
close(fd);

Archive Download the corresponding diff file

Branches