Initial commit

2021-08-21 18:22:58 +02:00 · 2021-08-21 18:22:58 +02:00 · 39e2f6ecc9
commit 39e2f6ecc9
10 changed files with 1561 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,5 @@
+*~
+libupdfparser.a
+libupdfparser.so
+
+
--- a/165
+++ b/165
@ -0,0 +1,165 @@
+                   GNU LESSER GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+
+  This version of the GNU Lesser General Public License incorporates
+the terms and conditions of version 3 of the GNU General Public
+License, supplemented by the additional permissions listed below.
+
+  0. Additional Definitions.
+
+  As used herein, "this License" refers to version 3 of the GNU Lesser
+General Public License, and the "GNU GPL" refers to version 3 of the GNU
+General Public License.
+
+  "The Library" refers to a covered work governed by this License,
+other than an Application or a Combined Work as defined below.
+
+  An "Application" is any work that makes use of an interface provided
+by the Library, but which is not otherwise based on the Library.
+Defining a subclass of a class defined by the Library is deemed a mode
+of using an interface provided by the Library.
+
+  A "Combined Work" is a work produced by combining or linking an
+Application with the Library.  The particular version of the Library
+with which the Combined Work was made is also called the "Linked
+Version".
+
+  The "Minimal Corresponding Source" for a Combined Work means the
+Corresponding Source for the Combined Work, excluding any source code
+for portions of the Combined Work that, considered in isolation, are
+based on the Application, and not on the Linked Version.
+
+  The "Corresponding Application Code" for a Combined Work means the
+object code and/or source code for the Application, including any data
+and utility programs needed for reproducing the Combined Work from the
+Application, but excluding the System Libraries of the Combined Work.
+
+  1. Exception to Section 3 of the GNU GPL.
+
+  You may convey a covered work under sections 3 and 4 of this License
+without being bound by section 3 of the GNU GPL.
+
+  2. Conveying Modified Versions.
+
+  If you modify a copy of the Library, and, in your modifications, a
+facility refers to a function or data to be supplied by an Application
+that uses the facility (other than as an argument passed when the
+facility is invoked), then you may convey a copy of the modified
+version:
+
+   a) under this License, provided that you make a good faith effort to
+   ensure that, in the event an Application does not supply the
+   function or data, the facility still operates, and performs
+   whatever part of its purpose remains meaningful, or
+
+   b) under the GNU GPL, with none of the additional permissions of
+   this License applicable to that copy.
+
+  3. Object Code Incorporating Material from Library Header Files.
+
+  The object code form of an Application may incorporate material from
+a header file that is part of the Library.  You may convey such object
+code under terms of your choice, provided that, if the incorporated
+material is not limited to numerical parameters, data structure
+layouts and accessors, or small macros, inline functions and templates
+(ten or fewer lines in length), you do both of the following:
+
+   a) Give prominent notice with each copy of the object code that the
+   Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the object code with a copy of the GNU GPL and this license
+   document.
+
+  4. Combined Works.
+
+  You may convey a Combined Work under terms of your choice that,
+taken together, effectively do not restrict modification of the
+portions of the Library contained in the Combined Work and reverse
+engineering for debugging such modifications, if you also do each of
+the following:
+
+   a) Give prominent notice with each copy of the Combined Work that
+   the Library is used in it and that the Library and its use are
+   covered by this License.
+
+   b) Accompany the Combined Work with a copy of the GNU GPL and this license
+   document.
+
+   c) For a Combined Work that displays copyright notices during
+   execution, include the copyright notice for the Library among
+   these notices, as well as a reference directing the user to the
+   copies of the GNU GPL and this license document.
+
+   d) Do one of the following:
+
+       0) Convey the Minimal Corresponding Source under the terms of this
+       License, and the Corresponding Application Code in a form
+       suitable for, and under terms that permit, the user to
+       recombine or relink the Application with a modified version of
+       the Linked Version to produce a modified Combined Work, in the
+       manner specified by section 6 of the GNU GPL for conveying
+       Corresponding Source.
+
+       1) Use a suitable shared library mechanism for linking with the
+       Library.  A suitable mechanism is one that (a) uses at run time
+       a copy of the Library already present on the user's computer
+       system, and (b) will operate properly with a modified version
+       of the Library that is interface-compatible with the Linked
+       Version.
+
+   e) Provide Installation Information, but only if you would otherwise
+   be required to provide such information under section 6 of the
+   GNU GPL, and only to the extent that such information is
+   necessary to install and execute a modified version of the
+   Combined Work produced by recombining or relinking the
+   Application with a modified version of the Linked Version. (If
+   you use option 4d0, the Installation Information must accompany
+   the Minimal Corresponding Source and Corresponding Application
+   Code. If you use option 4d1, you must provide the Installation
+   Information in the manner specified by section 6 of the GNU GPL
+   for conveying Corresponding Source.)
+
+  5. Combined Libraries.
+
+  You may place library facilities that are a work based on the
+Library side by side in a single library together with other library
+facilities that are not Applications and are not covered by this
+License, and convey such a combined library under terms of your
+choice, if you do both of the following:
+
+   a) Accompany the combined library with a copy of the same work based
+   on the Library, uncombined with any other library facilities,
+   conveyed under the terms of this License.
+
+   b) Give prominent notice with the combined library that part of it
+   is a work based on the Library, and explaining where to find the
+   accompanying uncombined form of the same work.
+
+  6. Revised Versions of the GNU Lesser General Public License.
+
+  The Free Software Foundation may publish revised and/or new versions
+of the GNU Lesser General Public License from time to time. Such new
+versions will be similar in spirit to the present version, but may
+differ in detail to address new problems or concerns.
+
+  Each version is given a distinguishing version number. If the
+Library as you received it specifies that a certain numbered version
+of the GNU Lesser General Public License "or any later version"
+applies to it, you have the option of following the terms and
+conditions either of that published version or of any later version
+published by the Free Software Foundation. If the Library as you
+received it does not specify a version number of the GNU Lesser
+General Public License, you may choose any version of the GNU Lesser
+General Public License ever published by the Free Software Foundation.
+
+  If the Library as you received it specifies that a proxy can decide
+whether future versions of the GNU Lesser General Public License shall
+apply, that proxy's public statement of acceptance of any version is
+permanent authorization for you to choose that version for the
+Library.
--- a/Makefile
+++ b/Makefile
@ -0,0 +1,53 @@
+
+# CROSS is an optional cross-compiler prefix (e.g. arm-linux-gnueabihf-).
+# AR and CXX are predefined by make, so "?=" never assigns them and CROSS was
+# silently ignored. Only replace the built-in (or missing) definitions so that
+# values coming from the environment or the command line are still honoured.
+ifneq ($(filter default undefined,$(origin AR)),)
+AR := $(CROSS)ar
+endif
+ifneq ($(filter default undefined,$(origin CXX)),)
+CXX := $(CROSS)g++
+endif
+
+CXXFLAGS=-Wall -fPIC -I./include
+LDFLAGS=
+
+BUILD_STATIC ?= 0
+BUILD_SHARED ?= 1
+
+# Compare the *values* of the switches: "ifneq (BUILD_STATIC, 0)" compared the
+# literal word BUILD_STATIC with 0 and was therefore always true.
+TARGETS =
+ifneq ($(BUILD_STATIC),0)
+  TARGETS += libupdfparser.a
+endif
+ifneq ($(BUILD_SHARED),0)
+  TARGETS += libupdfparser.so
+endif
+
+# Any non-empty DEBUG value selects a debug build
+ifneq ($(DEBUG),)
+CXXFLAGS += -ggdb -O0
+else
+CXXFLAGS += -O2
+endif
+
+SRCDIR      := src
+INCDIR      := inc
+BUILDDIR    := obj
+TARGETDIR   := bin
+SRCEXT      := cpp
+OBJEXT      := o
+
+SOURCES = src/uPDFParser.cpp src/uPDFTypes.cpp
+OBJECTS     := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.$(SRCEXT)=.$(OBJEXT)))
+
+.PHONY: all clean
+
+all: obj $(TARGETS)
+
+obj:
+	mkdir -p obj
+
+# The object directory is an order-only prerequisite: it must exist before
+# compiling (also with "make -j") but its timestamp never forces a rebuild.
+$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(SRCEXT) | obj
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# Archive/link exactly the objects listed above rather than every obj/*.o
+libupdfparser.a: $(OBJECTS)
+	$(AR) crs $@ $(OBJECTS)
+
+libupdfparser.so: $(OBJECTS)
+	$(CXX) $(OBJECTS) $(LDFLAGS) -o $@ -shared
+
+# $^ already contains libupdfparser.a
+test: test.c libupdfparser.a
+	$(CXX) -ggdb -O0 $^ -o $@ -Iinclude
+
+clean:
+	rm -rf libupdfparser.so libupdfparser.a obj
--- a/README.md
+++ b/README.md
@ -0,0 +1,33 @@
+Introduction
+------------
+
+A very simple PDF parser that loads PDF objects without interpreting them (no zlib decompression, no stream or string decoding...).
+It currently only supports updating a PDF file by appending new objects.
+
+
+Compilation
+-----------
+
+Use the _make_ command:
+
+    make [CROSS=XXX] [DEBUG=1] [BUILD_STATIC=(0|1)] [BUILD_SHARED=(0|1)]
+
+CROSS defines an optional cross-compiler prefix (e.g. arm-linux-gnueabihf-).
+
+DEBUG can be set to any non-empty value to compile in debug mode (debug symbols, no optimization).
+
+BUILD_STATIC builds libupdfparser.a if set to 1 and nothing if set to 0 (default value); it can be combined with BUILD_SHARED.
+
+BUILD_SHARED builds libupdfparser.so if set to 1 (default value) and nothing if set to 0; it can be combined with BUILD_STATIC.
+
+
+Copyright
+---------
+
+Grégory Soutadé
+
+
+License
+-------
+
+LGPL v3 or later
--- a/include/uPDFObject.h
+++ b/include/uPDFObject.h
@ -0,0 +1,150 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _UPDFOBJECT_HPP_
+#define _UPDFOBJECT_HPP_
+
+#include "uPDFTypes.h"
+
+namespace uPDFParser
+{
+    /**
+     * @brief PDF Object
+     *
+     * Holds an object id, a generation number, a dictionary and a list of
+     * additional data items. The items stored in data() are owned by the
+     * object: they are deleted by the destructor and cloned on copy.
+     *
+     * NOTE(review): dictionary values are cloned on copy but are not released
+     * by the destructor; confirm the intended ownership before changing this.
+     */
+    class Object
+    {
+    public:
+	Object():
+	    _objectId(0), _generationNumber(0),
+	    offset(0), _isNew(false), indirectOffset(0)
+	{}
+
+	/**
+	 * @brief Object constructor
+	 *
+	 * @param objectId          Object ID
+	 * @param generationNumber  Object generation number
+	 * @param offset            Offset of object in current PDF file
+	 * @param isNew             false if object has been read from file,
+	 *                          true if it has been created or updated
+	 * @param indirectOffset    Object is indirect
+	 */
+	Object(int objectId, int generationNumber, uint64_t offset, bool isNew=false,
+	       off_t indirectOffset=0):
+	    _objectId(objectId), _generationNumber(generationNumber),
+	    offset(offset), _isNew(isNew), indirectOffset(indirectOffset)
+	{}
+
+	~Object()
+	{
+	    std::vector<DataType*>::iterator it;
+	    for(it=_data.begin(); it!=_data.end(); it++)
+		delete *it;
+	}
+
+	/**
+	 * @brief Deep copy: data items and dictionary values are cloned
+	 *
+	 * The copy is always flagged as new.
+	 */
+	Object(const Object& other):
+	    _objectId(other._objectId), _generationNumber(other._generationNumber),
+	    offset(other.offset), _isNew(true), indirectOffset(other.indirectOffset)
+	{
+	    copyContentFrom(other);
+	}
+
+	/**
+	 * @brief Deep assignment (same semantics as the copy constructor)
+	 *
+	 * Without it the implicit member-wise assignment would share the
+	 * pointers of _data between both objects and delete them twice.
+	 */
+	Object& operator=(const Object& other)
+	{
+	    if (this != &other)
+	    {
+		// Clone first, then swap: our previous data items end up in
+		// tmp and are released by its destructor
+		Object tmp(other);
+
+		_objectId = tmp._objectId;
+		_generationNumber = tmp._generationNumber;
+		offset = tmp.offset;
+		_isNew = tmp._isNew;
+		indirectOffset = tmp.indirectOffset;
+		_data.swap(tmp._data);
+		_dictionary.value().swap(tmp._dictionary.value());
+	    }
+	    return *this;
+	}
+
+	/**
+	 * @brief Clone current object (call copy constructor)
+	 */
+	Object* clone() { return new Object(*this); }
+
+	/**
+	 * @brief Return internal dictionary
+	 */
+	Dictionary& dictionary() {return _dictionary;}
+
+	/**
+	 * @brief Return vector of data contained into object
+	 */
+	std::vector<DataType*>& data() {return _data;}
+
+	/**
+	 * @brief Object string representation
+	 */
+	std::string str();
+
+	/**
+	 * @brief Set object as indirect if offset != 0 or not indirect if offset == 0
+	 */
+	void setIndirectOffset(off_t offset) {indirectOffset = offset;}
+
+	/**
+	 * @brief is object indirect (indirectOffset != 0)
+	 */
+	bool isIndirect() const {return indirectOffset != 0;}
+
+	/**
+	 * @brief Get dictionary value
+	 *
+	 * As with std::map::operator[], a null entry is inserted when the key
+	 * is not present yet; use hasKey() to test for a key.
+	 */
+	DataType*& operator[](const std::string& key) { return _dictionary.value()[key]; }
+
+	/**
+	 * @brief Check for key in object's dictionary
+	 */
+	bool hasKey(const std::string& key) { return _dictionary.value().count(key)?true:false; }
+
+	/**
+	 * @brief is object new (or updated) ?
+	 */
+	bool isNew() const { return _isNew; }
+
+	/**
+	 * @brief Mark object as updated
+	 */
+	void update(void) { _isNew = true; }
+
+	/**
+	 * @brief Return object's id
+	 */
+	int objectId() const { return _objectId; }
+
+	/**
+	 * @brief Return object's generation number
+	 */
+	int generationNumber() const { return _generationNumber; }
+
+    private:
+	/**
+	 * @brief Clone data items and dictionary values of other into this
+	 *
+	 * Null entries (the parser stores 0 for a key without value) are
+	 * copied as null instead of being dereferenced.
+	 */
+	void copyContentFrom(const Object& other)
+	{
+	    std::vector<DataType*>::const_iterator it;
+	    for(it=other._data.begin(); it!=other._data.end(); it++)
+		_data.push_back(*it ? (*it)->clone() : 0);
+
+	    // Dictionary::value() is not const-qualified; the source is only
+	    // read here, so no temporary copy of the dictionary is needed
+	    const std::map<std::string, DataType*>& source =
+		const_cast<Dictionary&>(other._dictionary).value();
+	    std::map<std::string, DataType*>& target = _dictionary.value();
+	    std::map<std::string, DataType*>::const_iterator it2;
+	    for(it2=source.begin(); it2!=source.end(); it2++)
+		target[it2->first] = it2->second ? it2->second->clone() : 0;
+	}
+
+	int _objectId;
+	int _generationNumber;
+	off_t offset;
+	bool _isNew;
+	off_t indirectOffset;
+	Dictionary _dictionary;
+	std::vector<DataType*> _data;
+    };
+}
+#endif
--- a/include/uPDFParser.h
+++ b/include/uPDFParser.h
@ -0,0 +1,109 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _UPDFPARSER_HPP_
+#define _UPDFPARSER_HPP_
+
+#include <exception>
+#include <map>
+#include <vector>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
+#include <string.h>
+#include <unistd.h>
+
+#include "uPDFTypes.h"
+#include "uPDFObject.h"
+
+namespace uPDFParser
+{
+    /**
+     * @brief PDF Parser
+     *
+     * Owns the objects returned by objects(): they are deleted by the
+     * destructor, including the ones registered with addObject().
+     *
+     * NOTE(review): no copy constructor/assignment is declared, so copying a
+     * Parser shares the object pointers and the file descriptor between both
+     * instances; do not copy instances.
+     */
+    class Parser
+    {
+    public:
+	// Every scalar member is initialized: the offsets were previously
+	// left indeterminate until a file had been parsed
+	Parser():
+	    xrefOffset(0), fd(0), curOffset(0)
+	{}
+
+	~Parser()
+	{
+	    // 0 means "no file opened"; a negative value (failed open) is
+	    // not a descriptor either
+	    if (fd > 0) close(fd);
+
+	    std::vector<Object*>::iterator it;
+	    for(it=_objects.begin(); it!=_objects.end(); it++)
+		delete *it;
+	}
+
+	/**
+	 * @brief Parse a file
+	 */
+	void parse(const std::string& filename);
+
+	/**
+	 * @brief Write a PDF file with internal objects
+	 *
+	 * @param filename File path
+	 * @param update   Only append new objects if true
+	 *                 Write a new PDF file if false (not supported for now)
+	 */
+	void write(const std::string& filename, bool update=false);
+
+	/**
+	 * @brief Get internals (or parsed) objects
+	 */
+	std::vector<Object*>& objects() { return _objects; }
+
+	/**
+	 * @brief Add an object (it will be deleted with the parser)
+	 */
+	void addObject(Object* object) { _objects.push_back(object); }
+
+    private:
+	void parseObject(std::string& token);
+	void parseXref();
+	void parseTrailer();
+
+	std::string nextToken(bool exceptionOnEOF=true);
+
+	DataType* parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict);
+	void parseDictionary(Object* object, std::map<std::string, DataType*>& dict);
+	DataType* parseSignedNumber(std::string& token);
+	DataType* parseNumber(std::string& token);
+	DataType* parseNumberOrReference(std::string& token);
+	Array* parseArray(Object* object);
+	String* parseString();
+	HexaString* parseHexaString();
+	Stream* parseStream();
+	Name* parseName(std::string& token);
+
+	void writeUpdate(const std::string& filename);
+
+	std::vector<Object*> _objects;
+	Object trailer;
+	off_t xrefOffset;
+	int fd;
+	off_t curOffset;
+    };
+}
+
+#endif
--- a/include/uPDFParser_common.h
+++ b/include/uPDFParser_common.h
@ -0,0 +1,72 @@
+#ifndef _UPDFPARSER_COMMON_HPP_
+#define _UPDFPARSER_COMMON_HPP_
+
+#include <exception>
+#include <iomanip>
+#include <sstream>
+#include <stdlib.h>
+#include <string.h>
+
+namespace uPDFParser
+{
+    /**
+     * @brief Error codes passed to uPDFParser::Exception
+     *
+     * Values are spelled out so that a code printed by an exception can be
+     * matched to its name at a glance.
+     */
+    enum PARSING_ERROR {
+	UNABLE_TO_OPEN_FILE = 1,
+	TRUNCATED_FILE      = 2,
+	INVALID_HEADER      = 3,
+	INVALID_LINE        = 4,
+	INVALID_FOOTER      = 5,
+	INVALID_DICTIONARY  = 6,
+	INVALID_NAME        = 7,
+	INVALID_BOOLEAN     = 8,
+	INVALID_NUMBER      = 9,
+	INVALID_STREAM      = 10,
+	INVALID_TOKEN       = 11,
+	INVALID_OBJECT      = 12,
+	INVALID_TRAILER     = 13,
+	INVALID_HEXASTRING  = 14,
+	NOT_IMPLEMENTED     = 15
+    };
+
+    /**
+     * @brief Exception class
+     *
+     * Carries an error code plus a formatted description (code, message,
+     * source file and line) returned by what(). The description is a private
+     * heap copy, so instances can be copied and assigned safely.
+     */
+    class Exception : public std::exception
+    {
+    public:
+
+	/**
+	 * @param code     Error code (see PARSING_ERROR)
+	 * @param message  Description, copied into the text returned by what()
+	 * @param file     Source file name; the pointer itself is kept
+	 * @param line     Source line
+	 */
+	Exception(int code, const char* message, const char* file, int line):
+	    code(code), line(line), message(0), file(file)
+	{
+	    // The message pointer is not kept (the member stays null): only
+	    // its content, formatted below, is stored
+	    std::stringstream msg;
+	    msg << "Exception code : 0x" << std::setbase(16) << code << std::endl;
+	    msg << "Message        : " << message << std::endl;
+	    msg << "File           : " << file << ":" << std::setbase(10) << line << std::endl;
+	    fullmessage = strdup(msg.str().c_str());
+	}
+
+	// line and file used to be assigned from themselves here, leaving
+	// them indeterminate in every copy
+	Exception(const Exception& other):
+	    std::exception(other),
+	    code(other.code), line(other.line), message(0), file(other.file),
+	    fullmessage(other.fullmessage ? strdup(other.fullmessage) : 0)
+	{}
+
+	// Needed with the user-defined destructor: the implicit assignment
+	// would make two instances free() the same buffer
+	Exception& operator=(const Exception& other)
+	{
+	    if (this != &other)
+	    {
+		char* copy = other.fullmessage ? strdup(other.fullmessage) : 0;
+		free(fullmessage);
+		fullmessage = copy;
+		code = other.code;
+		line = other.line;
+		file = other.file;
+	    }
+	    return *this;
+	}
+
+	~Exception()
+	{
+	    free(fullmessage);
+	}
+
+	// Never returns a null pointer, even if the message allocation failed
+	const char * what () const throw () { return fullmessage ? fullmessage : ""; }
+
+	int getErrorCode() const {return code;}
+
+    private:
+	int code, line;
+	const char* message, *file;
+	char* fullmessage;
+    };
+    
+    // Throw a uPDFParser::Exception built from an error code and a message.
+    // The message is a stream expression (a << b << c) and the current
+    // __FILE__/__LINE__ are recorded in the exception.
+    // NOTE: the macro expands to a brace-enclosed block, not to a
+    // do { } while (0) statement: "if (x) EXCEPTION(...); else ..." does not
+    // compile, so wrap the call in braces in that situation.
+#define EXCEPTION(code, message)					\
+    {std::stringstream __msg;__msg << message; throw uPDFParser::Exception(code, __msg.str().c_str(), __FILE__, __LINE__);}
+}
+#endif
--- a/include/uPDFTypes.h
+++ b/include/uPDFTypes.h
@ -0,0 +1,253 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _UPDFTYPES_HPP_
+#define _UPDFTYPES_HPP_
+
+#include <map>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <sstream>
+
+namespace uPDFParser
+{
+    /**
+     * @brief Abstract base class shared by every PDF value type
+     * From https://resources.infosecinstitute.com/topic/pdf-file-format-basic-structure/
+     */
+    class DataType
+    {
+    public:
+	enum TYPE {BOOLEAN, INTEGER, REAL, NAME, STRING, HEXASTRING, REFERENCE, ARRAY, DICTIONARY, STREAM};
+
+	DataType(TYPE _type): _type(_type) {}
+
+	virtual ~DataType() {}
+
+	/**
+	 * @brief Create a copy of this value
+	 */
+	virtual DataType* clone() = 0;
+
+	/**
+	 * @brief Serialized (textual) form of this value
+	 */
+	virtual std::string str() = 0;
+
+	/**
+	 * @brief Kind of value stored in this instance
+	 */
+	TYPE type() { return _type; }
+
+    protected:
+	TYPE _type;
+    };
+    
+    /**
+     * @brief Boolean value, serialized as " true" or " false"
+     */
+    class Boolean : public DataType
+    {
+    public:
+	Boolean(bool value):
+	    DataType(DataType::TYPE::BOOLEAN), _value(value)
+	{}
+
+	bool value() {return _value;}
+
+	virtual DataType* clone() {return new Boolean(_value);}
+
+	// The leading space is part of the serialized form
+	virtual std::string str()
+	{
+	    if (_value)
+		return " true";
+	    return " false";
+	}
+
+    private:
+	bool _value;
+    };
+
+    /**
+     * @brief Integer value
+     *
+     * The second constructor argument is stored next to the value; the parser
+     * sets it when the number was written with an explicit '+' or '-'.
+     */
+    class Integer : public DataType
+    {
+    public:
+	Integer(int value, bool _signed=false):
+	    DataType(DataType::TYPE::INTEGER), _value(value), _signed(_signed)
+	{}
+
+	int value() {return _value;}
+
+	// Copy keeps both the value and the sign flag
+	virtual DataType* clone() {return new Integer(_value, _signed);}
+
+	virtual std::string str();
+
+    private:
+	int _value;
+	bool _signed;
+    };
+    
+    /**
+     * @brief Real (single precision floating point) value
+     *
+     * The second constructor argument is stored next to the value; the parser
+     * sets it when the number was written with an explicit '+' or '-'.
+     */
+    class Real : public DataType
+    {
+    public:
+	Real(float value, bool _signed=false):
+	    DataType(DataType::TYPE::REAL), _value(value), _signed(_signed)
+	{}
+
+	float value() {return _value;}
+
+	// Copy keeps both the value and the sign flag
+	virtual DataType* clone() {return new Real(_value, _signed);}
+
+	virtual std::string str();
+
+    private:
+	float _value;
+	bool _signed;
+    };
+
+    /**
+     * @brief Name value
+     *
+     * str() returns the stored text unchanged, value() returns it without its
+     * first character (the leading '/' for names created by the parser).
+     */
+    class Name : public DataType
+    {
+    public:
+	Name(const std::string&);
+
+	virtual DataType* clone() {return new Name(_value);}
+
+	/**
+	 * @brief Stored text without its first character
+	 *
+	 * An empty stored text yields an empty string; skipping the first
+	 * character unconditionally would read past the end of the buffer.
+	 */
+	std::string value() {
+	    if (_value.empty())
+		return std::string();
+	    return std::string(_value.c_str() + 1);
+	}
+	virtual std::string str() { return _value;}
+
+    private:
+	std::string _value;
+    };
+
+    /**
+     * @brief Literal string value, serialized between parentheses
+     */
+    class String : public DataType
+    {
+    public:
+	String(const std::string&);
+
+	virtual DataType* clone() {return new String(_value);}
+	std::string value() {return _value;}
+
+	/**
+	 * @brief Serialize the string, escaping unescaped '(' and ')'
+	 *
+	 * A parenthesis is already escaped only when it follows a backslash
+	 * that is not itself escaped: in "\\(" the two backslashes form an
+	 * escaped backslash and the parenthesis still has to be escaped.
+	 * Looking at the previous character alone gets that case wrong.
+	 */
+	virtual std::string str() {
+	    std::string res("(");
+	    // true when the previous character is a backslash that escapes
+	    // the current one
+	    bool escaped = false;
+
+	    for(std::string::size_type i=0; i<_value.size(); i++)
+	    {
+		const char c = _value[i];
+
+		if ((c == '(' || c == ')') && !escaped)
+		    res += '\\';
+		res += c;
+		escaped = (c == '\\' && !escaped);
+	    }
+
+	    // A trailing lone backslash would escape the closing parenthesis
+	    if (escaped)
+		res += '\\';
+
+	    res += ")";
+	    return res;
+	}
+
+    private:
+	std::string _value;
+    };
+
+    /**
+     * @brief Hexadecimal string value, serialized between '<' and '>'
+     */
+    class HexaString : public DataType
+    {
+    public:
+	HexaString(const std::string&);
+
+	std::string value() {return _value;}
+
+	virtual DataType* clone() {return new HexaString(_value);}
+
+	// Stored text wrapped into angle brackets
+	virtual std::string str()
+	{
+	    std::string res("<");
+	    res += _value;
+	    res += ">";
+	    return res;
+	}
+
+    private:
+	std::string _value;
+    };
+
+    /**
+     * @brief Reference to another object, serialized as " <id> <generation> R"
+     */
+    class Reference : public DataType
+    {
+    public:
+	Reference(int objectId, int generationNumber):
+	    DataType(DataType::TYPE::REFERENCE), objectId(objectId), generationNumber(generationNumber)
+	{}
+
+	// Only the object id is exposed as value
+	int value() {return objectId;}
+
+	virtual DataType* clone() {return new Reference(objectId, generationNumber);}
+
+	virtual std::string str() {
+	    std::stringstream out;
+	    out << " " << objectId;
+	    out << " " << generationNumber;
+	    out << " R";
+	    return out.str();
+	}
+
+    private:
+	int objectId, generationNumber;
+    };
+
+    /**
+     * @brief Ordered list of values
+     *
+     * NOTE(review): no destructor is declared here, so the stored pointers
+     * are not deleted by this class.
+     */
+    class Array : public DataType
+    {
+    public:
+	Array():
+	    DataType(DataType::TYPE::ARRAY)
+	{}
+
+	// Append an item; the pointer itself is stored
+	void addData(DataType* data) {_value.push_back(data);}
+
+	std::vector<DataType*>& value() {return _value;}
+
+	// New array holding a clone of every item, in the same order
+	virtual DataType* clone() {
+	    Array* copy = new Array();
+	    for(std::vector<DataType*>::size_type i=0; i<_value.size(); i++)
+		copy->addData(_value[i]->clone());
+	    return copy;
+	}
+
+	virtual std::string str();
+
+    private:
+	std::vector<DataType*> _value;
+    };
+
+    /**
+     * @brief Map of keys to values
+     *
+     * NOTE(review): no destructor is declared here, so the stored pointers
+     * are not deleted by this class.
+     */
+    class Dictionary : public DataType
+    {
+    public:
+	Dictionary():
+	    DataType(DataType::TYPE::DICTIONARY)
+	{}
+
+	void addData(const std::string&, DataType*);
+
+	std::map<std::string, DataType*>& value() {return _value;}
+
+	// New dictionary holding a clone of every value under the same key
+	virtual DataType* clone() {
+	    Dictionary* copy = new Dictionary();
+	    std::map<std::string, DataType*>::iterator entry = _value.begin();
+	    while (entry != _value.end())
+	    {
+		copy->addData(entry->first, entry->second->clone());
+		++entry;
+	    }
+	    return copy;
+	}
+
+	virtual std::string str();
+
+    private:
+	std::map<std::string, DataType*> _value;
+    };
+
+    /**
+     * @brief Stream placeholder
+     *
+     * Only the two offsets given to the constructor are kept; str() emits an
+     * empty stream ("stream\nendstream\n"), no content is written.
+     */
+    class Stream : public DataType
+    {
+    public:
+	Stream(int startOffset, int endOffset):
+	    DataType(DataType::TYPE::STREAM), startOffset(startOffset),
+	    endOffset(endOffset)
+	{}
+
+	virtual std::string str() { return "stream\nendstream\n";}
+
+	virtual DataType* clone() {return new Stream(startOffset, endOffset);}
+
+    private:
+	int startOffset, endOffset;
+    };
+}
+
+#endif
--- a/src/uPDFParser.cpp
+++ b/src/uPDFParser.cpp
@ -0,0 +1,616 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "uPDFParser.h"
+#include "uPDFParser_common.h"
+
+namespace uPDFParser
+{
+    /**
+     * @brief Serialize the object: "<id> <generation> obj" line, dictionary,
+     *        every data item, then "endobj"
+     */
+    std::string Object::str()
+    {
+	std::stringstream out;
+
+	out << _objectId << " " << _generationNumber << " obj\n"
+	    << _dictionary.str();
+
+	for(std::vector<DataType*>::size_type i=0; i<_data.size(); i++)
+	    out << _data[i]->str();
+
+	out << "endobj\n";
+
+	return out.str();
+    }
+
+    /**
+     * @brief Read data until '\n' or '\r' is found or buffer is full
+     *
+     * The end-of-line character is consumed but not stored. The buffer is
+     * always NUL terminated, so at most size-1 characters are stored per call
+     * (the terminator used to be omitted when the buffer was full or when the
+     * end of file was reached).
+     *
+     * @param fd              File descriptor to read from
+     * @param buffer          Destination buffer
+     * @param size            Size of buffer in bytes
+     * @param exceptionOnEOF  Throw TRUNCATED_FILE on end of file (or read
+     *                        error) if true, return -1 if false
+     *
+     * @return Number of characters stored, -1 on end of file
+     */
+    static inline int readline(int fd, char* buffer, int size, bool exceptionOnEOF=true)
+    {
+	int res = 0;
+	char c;
+
+	// No room at all, not even for the terminator
+	if (size <= 0)
+	    return 0;
+
+	buffer[0] = 0;
+
+	// One byte is reserved for the terminator
+	while (res < size-1)
+	{
+	    if (read(fd, &c, 1) != 1)
+	    {
+		buffer[res] = 0;
+		if (exceptionOnEOF)
+		    EXCEPTION(TRUNCATED_FILE, "Unexpected end of file");
+		return -1;
+	    }
+
+	    if (c == '\n' || c == '\r')
+		break;
+
+	    buffer[res++] = c;
+	}
+
+	buffer[res] = 0;
+
+	return res;
+    }
+
+    /**
+     * @brief Read data until EOF, '\n' or '\r' is found
+     *
+     * '\r' ends the line as well (as documented, and as readline() does), so
+     * a line terminated by a lone carriage return no longer swallows the
+     * following lines. With "\r\n" the remaining '\n' is left unread.
+     */
+    static inline void finishLine(int fd)
+    {
+	char c;
+
+	while (read(fd, &c, 1) == 1)
+	{
+	    if (c == '\n' || c == '\r')
+		break;
+	}
+    }
+
+    /**
+     * @brief Find next token to analyze
+     *
+     * Leading white characters are skipped. A token ends on a line return
+     * (consumed), on a delimiter (left unread) or on a comment. The characters
+     * < > [ ] ( ) are a token on their own, "<<" and ">>" are returned as a
+     * single token. curOffset is set to the offset of the first character of
+     * the token.
+     *
+     * '+' and '-' do not end a token: they are regular characters that appear
+     * inside names (/ABCDEF+Arial-Bold) and used to cut them in pieces.
+     *
+     * NOTE(review): a comment ('%') ends the scan even when no character has
+     * been read yet, in which case an empty token is returned.
+     *
+     * @param exceptionOnEOF  Throw TRUNCATED_FILE on end of file if true,
+     *                        return what has been read so far if false
+     */
+    std::string Parser::nextToken(bool exceptionOnEOF)
+    {
+	char c;
+	std::string res("");
+	static const char delims[] = " \t<>[]()/";
+	static const char start_delims[] = "<>[]()";
+	bool found = false;
+
+	while (!found)
+	{
+	    if (read(fd, &c, 1) != 1)
+	    {
+		if (exceptionOnEOF)
+		    EXCEPTION(TRUNCATED_FILE, "Unexpected end of file");
+		break;
+	    }
+
+	    // Comment, skip line
+	    if (c == '%')
+	    {
+		finishLine(fd);
+		break;
+	    }
+
+	    // White character while empty result, continue
+	    if ((c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\0') && !res.size())
+		continue;
+
+	    // Line return after some characters: end of token, the line
+	    // return stays consumed (no lseek(fd, -1, SEEK_CUR))
+	    if (c == '\n' || c == '\r')
+		break;
+
+	    if (res.size())
+	    {
+		// Push character until delimiter is found. strchr() also
+		// matches the terminating NUL of delims, hence a NUL byte
+		// ends the token too.
+		if (strchr(delims, c))
+		{
+		    lseek(fd, -1, SEEK_CUR);
+		    found = true;
+		}
+		else
+		    res += c;
+	    }
+	    else
+	    {
+		curOffset = lseek(fd, 0, SEEK_CUR)-1;
+
+		// First character, is it a delimiter ? (c cannot be NUL
+		// here, it has been skipped as a white character)
+		if (strchr(start_delims, c))
+		    found = true;
+
+		res += c;
+	    }
+	}
+
+	// Double '>' and '<' to compute dictionary
+	if (res == ">" || res == "<")
+	{
+	    if (read(fd, &c, 1) == 1)
+	    {
+		if (c == res[0])
+		    res += c;
+		else
+		    lseek(fd, -1, SEEK_CUR);
+	    }
+	}
+
+	return res;
+    }
+
+    /**
+     * @brief Parse the trailer: dictionary, "startxref", offset and "%%EOF"
+     *
+     * The dictionary is stored into the trailer object; the offset following
+     * "startxref" is read and discarded.
+     *
+     * Throws INVALID_TRAILER when one of the expected elements is missing.
+     */
+    void Parser::parseTrailer()
+    {
+	std::string token;
+	char buffer[10];
+	int len;
+
+	token = nextToken();
+
+	if (token != "<<")
+	    EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
+
+	parseDictionary(&trailer, trailer.dictionary().value());
+
+	token = nextToken();
+	if (token != "startxref")
+	    EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
+
+	// Offset of the cross reference table, not used
+	token = nextToken();
+
+	// nextToken() stops right after the first end of line character: with
+	// "\r\n" line endings the '\n' is still pending and the next line
+	// read is empty. Skip empty lines before looking for the marker
+	// (readline() returns -1 on end of file, which ends the loop).
+	do
+	{
+	    len = readline(fd, buffer, sizeof(buffer), false);
+	} while (len == 0);
+
+	if (strncmp(buffer, "%%EOF", 5))
+	    EXCEPTION(INVALID_TRAILER, "Invalid trailer at offset " << curOffset);
+    }
+    
+    /**
+     * @brief Skip the cross reference table, then parse the trailer
+     *
+     * The offset of the current token is recorded in xrefOffset; every token
+     * up to the "trailer" keyword is ignored.
+     */
+    void Parser::parseXref()
+    {
+	xrefOffset = curOffset;
+
+	std::string keyword;
+	do
+	{
+	    keyword = nextToken();
+	} while (keyword != "trailer");
+
+	parseTrailer();
+    }
+
+    /**
+     * @brief Convert a token into a Real (it contains a '.') or an Integer
+     *
+     * @param token  Digits to convert; a '0' is prepended to the token when it
+     *               starts with '.'
+     * @param sign   Sign character read before the digits, '\0' if none; the
+     *               value is negated for '-'
+     *
+     * Conversion relies on std::stof()/std::stoi() and lets their exceptions
+     * propagate.
+     */
+    static DataType* tokenToNumber(std::string& token, char sign='\0')
+    {
+	const bool hasSign = (sign != '\0');
+	const bool negative = (sign == '-');
+
+	if (token.find('.') != std::string::npos)
+	{
+	    if (token[0] == '.')
+		token = std::string("0") + token;
+
+	    float real = std::stof(token);
+	    if (negative)
+		real = -real;
+	    return new Real(real, hasSign);
+	}
+
+	int integer = std::stoi(token);
+	if (negative)
+	    integer = -integer;
+
+	return new Integer(integer, hasSign);
+    }
+    
+    /**
+     * @brief Parse a number whose first character is its sign
+     *
+     * The sign is removed from the token before conversion.
+     */
+    DataType* Parser::parseSignedNumber(std::string& token)
+    {
+	const char sign = token[0];
+
+	// Keep everything after the sign character
+	std::string digits(token.c_str() + 1);
+	token = digits;
+
+	return tokenToNumber(token, sign);
+    }
+    
+    /**
+     * @brief Parse an unsigned number (Integer or Real)
+     */
+    DataType* Parser::parseNumber(std::string& token)
+    {
+	DataType* number = tokenToNumber(token);
+	return number;
+    }
+
+    /**
+     * @brief Parse a number, or a reference "<id> <generation> R"
+     *
+     * After an integer, the two following tokens are read ahead: if they are
+     * an integer and "R" a Reference is returned, otherwise the file position
+     * is restored and the first number is returned.
+     *
+     * Every temporary value is released on all paths: the generation number
+     * used to leak when a reference was found, and the first number when the
+     * look-ahead raised an exception.
+     */
+    DataType* Parser::parseNumberOrReference(std::string& token)
+    {
+	DataType* res = tokenToNumber(token);
+
+	if (res->type() == DataType::TYPE::REAL)
+	    return res;
+
+	off_t offset = lseek(fd, 0, SEEK_CUR);
+	DataType* generationNumber = 0;
+	std::string token3;
+
+	try
+	{
+	    std::string token2 = nextToken();
+	    token3 = nextToken();
+	    generationNumber = tokenToNumber(token2);
+	}
+	catch (std::logic_error& e)
+	{
+	    // std::invalid_argument or std::out_of_range: the second token
+	    // is not a usable number, so this is a plain number
+	    lseek(fd, offset, SEEK_SET);
+	    return res;
+	}
+	catch (...)
+	{
+	    // Parsing error (end of file...): do not leak the first number
+	    delete res;
+	    throw;
+	}
+
+	if ((generationNumber->type() != DataType::TYPE::INTEGER) ||
+	    token3.size() != 1 || token3[0] != 'R')
+	{
+	    delete generationNumber;
+	    lseek(fd, offset, SEEK_SET);
+	    return res;
+	}
+
+	DataType* res2 = new Reference(((Integer*)res)->value(),
+				       ((Integer*)generationNumber)->value());
+	delete res;
+	delete generationNumber;
+	return res2;
+    }
+    
+    /**
+     * @brief Build the value starting at token
+     *
+     * The kind of value is selected from the whole token ("<<", "[", "(",
+     * "<", "stream", "true", "false") or from its first character (digit,
+     * '/', sign, '.'). Throws INVALID_TOKEN for anything else.
+     *
+     * @param token   Token just read
+     * @param object  Object being parsed, forwarded to nested parsers
+     * @param dict    Not used here
+     */
+    DataType* Parser::parseType(std::string& token, Object* object, std::map<std::string, DataType*>& dict)
+    {
+	if (token == "<<")
+	{
+	    Dictionary* nested = new Dictionary();
+	    parseDictionary(object, nested->value());
+	    return nested;
+	}
+	if (token == "[")
+	    return parseArray(object);
+	if (token == "(")
+	    return parseString();
+	if (token == "<")
+	    return parseHexaString();
+	if (token == "stream")
+	    return parseStream();
+
+	const char first = token[0];
+
+	if (first >= '1' && first <= '9')
+	    return parseNumberOrReference(token);
+	if (first == '/')
+	    return parseName(token);
+	if (first == '+' || first == '-')
+	    return parseSignedNumber(token);
+	if (first == '0' || first == '.')
+	    return parseNumber(token);
+
+	if (token == "true")
+	    return new Boolean(true);
+	if (token == "false")
+	    return new Boolean(false);
+
+	EXCEPTION(INVALID_TOKEN, "Invalid token " << token << " at offset " << curOffset);
+
+	// Not reached
+	return 0;
+    }
+
+    /**
+     * @brief Parse the elements of an array until the closing "]"
+     *
+     * The opening "[" has already been read by the caller. Every token
+     * before "]" is handed to parseType() and the result is appended to
+     * the array.
+     *
+     * @param object Object being parsed (forwarded to parseType())
+     *
+     * @return Newly allocated Array
+     */
+    Array* Parser::parseArray(Object* object)
+    {
+	Array* array = new Array();
+
+	for (std::string token = nextToken(); token != "]"; token = nextToken())
+	{
+	    DataType* element = parseType(token, object, object->dictionary().value());
+	    array->addData(element);
+	}
+
+	return array;
+    }
+    
+    /**
+     * @brief Parse a literal string until its closing parenthesis
+     *
+     * The opening "(" has already been read by the caller. Characters are
+     * stored as found in the file (escape sequences are kept as is).
+     *
+     * A backslash escapes only the character that follows it, so "\\)" is
+     * an escaped backslash followed by the real end of the string.
+     * Unescaped balanced parentheses are part of the string: only the ")"
+     * that matches the opening one terminates it.
+     *
+     * Reading also stops when no more data can be read from the file.
+     *
+     * @return Newly allocated String
+     */
+    String* Parser::parseString()
+    {
+	std::string res("");
+	char c;
+	bool escaped = false;
+	int depth = 0; // Unescaped '(' still waiting for their ')'
+
+	while (1)
+	{
+	    if (read(fd, &c, 1) != 1)
+		break;
+
+	    if (escaped)
+		escaped = false; // Escaped character, whatever it is
+	    else if (c == '\\')
+		escaped = true;
+	    else if (c == '(')
+		depth++;
+	    else if (c == ')')
+	    {
+		if (!depth)
+		    break;
+		depth--;
+	    }
+
+	    res += c;
+	}
+
+	return new String(res);
+    }
+    
+    /**
+     * @brief Parse an hexadecimal string until the closing ">"
+     *
+     * The opening "<" has already been read by the caller. White-space
+     * characters (NUL, tab, line feed, form feed, carriage return, space)
+     * found between the digits are skipped: they are neither stored nor
+     * counted in the length check.
+     *
+     * Reading also stops when no more data can be read from the file.
+     * A string with an odd number of characters is reported with
+     * EXCEPTION(INVALID_HEXASTRING).
+     *
+     * @return Newly allocated HexaString
+     */
+    HexaString* Parser::parseHexaString()
+    {
+	std::string res("");
+	char c;
+
+	while (1)
+	{
+	    if (read(fd, &c, 1) != 1)
+		break;
+
+	    if (c == '>')
+		break;
+
+	    // White-spaces are allowed between digits and must be ignored
+	    if (c == ' ' || c == '\t' || c == '\n' || c == '\r' ||
+		c == '\f' || c == '\0')
+		continue;
+
+	    res += c;
+	}
+
+	if ((res.size() % 2))
+	    EXCEPTION(INVALID_HEXASTRING, "Invalid hexa String at offset " << curOffset);
+
+	return new HexaString(res);
+    }
+
+    /**
+     * @brief Skip stream data and record where it is located
+     *
+     * Lines are read with readline() until one starts with "endstream".
+     * The file offset taken just before reading that line is used as end
+     * offset, curOffset is used as start offset.
+     *
+     * @return Newly allocated Stream(curOffset, end offset)
+     */
+    Stream* Parser::parseStream()
+    {
+	char line[1024];
+	off_t endOffset;
+
+	do
+	{
+	    // Offset of the line that is about to be read
+	    endOffset = lseek(fd, 0, SEEK_CUR);
+	    readline(fd, line, sizeof(line));
+	} while (strncmp(line, "endstream", 9) != 0);
+
+	return new Stream(curOffset, endOffset);
+    }
+    
+    /**
+     * @brief Build a Name from a token
+     *
+     * The token must not be empty and must start with '/', otherwise
+     * EXCEPTION(INVALID_NAME) is raised.
+     *
+     * @param name Token to convert (passed unchanged to the Name)
+     *
+     * @return Newly allocated Name
+     */
+    Name* Parser::parseName(std::string& name)
+    {
+	const bool valid = !name.empty() && name[0] == '/';
+
+	if (!valid)
+	    EXCEPTION(INVALID_NAME, "Invalid Name at offset " << curOffset);
+
+	return new Name(name);
+    }
+   
+    /**
+     * @brief Parse key/value pairs until the closing ">>"
+     *
+     * The opening "<<" has already been read by the caller. Each key is
+     * checked with parseName() and its value is built by parseType().
+     * A key directly followed by ">>" is stored with a null value.
+     *
+     * @param object Object being parsed (forwarded to parseType())
+     * @param dict   Map that receives the parsed entries
+     */
+    void Parser::parseDictionary(Object* object, std::map<std::string, DataType*>& dict)
+    {
+	std::string token;
+	Name* key;
+	DataType* value;
+
+	while (1)
+	{
+	    token = nextToken();
+	    if (token == ">>")
+		break;
+
+	    key = parseName(token);
+	    // Only the text of the key is stored in the map: release the
+	    // temporary Name right now so that it can not be leaked
+	    const std::string keyName = key->value();
+	    delete key;
+
+	    token = nextToken();
+	    if (token == ">>")
+	    {
+		dict[keyName] = 0;
+		break;
+	    }
+
+	    value = parseType(token, object, dict);
+	    dict[keyName] = value;
+	}
+    }
+    
+    /**
+     * @brief Parse an indirect object ("<id> <generation> obj ... endobj")
+     *
+     * The object id is taken from the token given by the caller, the
+     * generation number and the "obj" keyword are read from the file.
+     * The new Object is appended to _objects, then its content is parsed
+     * until "endobj":
+     *  - "<<" fills the object dictionary
+     *  - a token starting with a digit 1-9 must be an integer and is
+     *    stored as indirect offset
+     *  - any other token is parsed with parseType() and its value is
+     *    discarded
+     *
+     * Malformed objects are reported with EXCEPTION(INVALID_OBJECT).
+     *
+     * @param token First token of the object (object id), overwritten
+     *              while parsing
+     */
+    void Parser::parseObject(std::string& token)
+    {
+	off_t offset;
+	int objectId, generationNumber;
+	Object* object;
+
+	offset = curOffset;
+	try
+	{
+	    objectId = std::stoi(token);
+	    token = nextToken();
+	    generationNumber = std::stoi(token);
+	}
+	catch(const std::invalid_argument&)
+	{
+	    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+	}
+	catch(const std::out_of_range&)
+	{
+	    // std::stoi() also throws when the number does not fit in an int
+	    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+	}
+
+	token = nextToken();
+
+	if (token != "obj")
+	    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+
+	std::cout << "New obj " << objectId << " " << generationNumber << std::endl;
+
+	object = new Object(objectId, generationNumber, offset);
+	_objects.push_back(object);
+
+	while (1)
+	{
+	    token = nextToken();
+
+	    if (token == "endobj")
+		break;
+
+	    if (token == "<<")
+		parseDictionary(object, object->dictionary().value());
+	    else if (token[0] >= '1' && token[0] <= '9')
+	    {
+		DataType* _offset = tokenToNumber(token);
+		if (_offset->type() != DataType::TYPE::INTEGER)
+		{
+		    delete _offset;
+		    EXCEPTION(INVALID_OBJECT, "Invalid object at offset " << curOffset);
+		}
+		object->setIndirectOffset(((Integer*)_offset)->value());
+		// Value has been copied, temporary number is no more needed
+		delete _offset;
+	    }
+	    else
+	    {
+		// Value is parsed only to move forward in the file: nobody
+		// keeps it, so release it
+		delete parseType(token, object, object->dictionary().value());
+	    }
+	}
+    }
+
+    /**
+     * @brief Parse a PDF file
+     *
+     * Opens the file read-only, checks that it starts with "%PDF", then
+     * reads top level tokens until an empty token is returned:
+     *  - "xref" is handled by parseXref()
+     *  - a token starting with a digit 1-9 is handled by parseObject()
+     *  - anything else is reported with EXCEPTION(INVALID_LINE)
+     *
+     * The file descriptor is closed and reset to 0 when parsing is over,
+     * so that a later call does not close a stale descriptor number that
+     * may have been reused for something else.
+     *
+     * @param filename Path of the file to parse
+     */
+    void Parser::parse(const std::string& filename)
+    {
+	// Zero filled: comparison below is well defined even if less than
+	// 4 bytes have been read
+	char buf[16] = {0};
+	std::string token;
+
+	if (fd)
+	    close(fd);
+
+	fd = open(filename.c_str(), O_RDONLY);
+
+	if (fd <= 0)
+	    EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)");
+
+	// Check %PDF at startup
+	readline(fd, buf, 4);
+	if (strncmp(buf, "%PDF", 4))
+	    EXCEPTION(INVALID_HEADER, "Invalid PDF header");
+	finishLine(fd);
+
+	curOffset = lseek(fd, 0, SEEK_CUR);
+
+	// // Check %%EOF at the end
+	// lseek(fd, -5, SEEK_END);
+	// readline(fd, buf, 5);
+	// if (strncmp(buf, "%%EOF", 5))
+	//     EXCEPTION(INVALID_FOOTER, "Invalid PDF footer");
+
+	lseek(fd, curOffset, SEEK_SET);
+
+	while (1)
+	{
+	    token = nextToken(false);
+
+	    // Empty token: nothing left to parse
+	    if (!token.size())
+		break;
+
+	    if (token == "xref")
+		parseXref();
+	    else if (token[0] >= '1' && token[0] <= '9')
+		parseObject(token);
+	    else
+		EXCEPTION(INVALID_LINE, "Invalid Line at offset " << curOffset);
+	}
+
+	close(fd);
+	// 0 is the "not opened" value tested at the beginning of this method
+	fd = 0;
+    }
+
+    /**
+     * @brief Append new objects to a file as an incremental update
+     *
+     * The file is opened in append mode (created if needed) and a "\r" is
+     * written first. Every object of _objects flagged as new is written,
+     * followed by an xref section describing them, the trailer (whose
+     * "Prev" entry is set to xrefOffset) and the startxref/%%EOF footer.
+     *
+     * When there is no new object, the file is closed right after the
+     * objects loop: no xref, trailer or footer is written.
+     *
+     * Note that curOffset is overwritten with the offset of each written
+     * object.
+     *
+     * @param filename Path of the file to update
+     */
+    void Parser::writeUpdate(const std::string& filename)
+    {
+	const int outFd = open(filename.c_str(), O_WRONLY|O_APPEND|O_CREAT, S_IRUSR|S_IWUSR);
+
+	if (outFd <= 0)
+	    EXCEPTION(UNABLE_TO_OPEN_FILE, "Unable to open " << filename << " (%m)");
+
+	::write(outFd, "\r", 1);
+
+	// xref section is built while objects are written
+	std::stringstream xrefSection;
+	xrefSection << std::setfill('0') << "xref\n";
+
+	int newObjectCount = 0;
+
+	for (Object* object : _objects)
+	{
+	    if (!object->isNew())
+		continue;
+
+	    ++newObjectCount;
+
+	    const std::string serialized = object->str();
+	    curOffset = lseek(outFd, 0, SEEK_CUR);
+	    ::write(outFd, serialized.c_str(), serialized.size());
+
+	    // One subsection (first id + count of 1) per object
+	    xrefSection << std::setw(0) << object->objectId() << " 1\n";
+	    xrefSection << std::setw(10) << curOffset << " " << std::setw(5) << object->generationNumber() << " n\r\n"; // Here \r seems important
+	}
+
+	if (newObjectCount == 0)
+	{
+	    close(outFd);
+	    return;
+	}
+
+	const off_t updatedXrefOffset = lseek(outFd, 0, SEEK_CUR);
+
+	const std::string xrefText = xrefSection.str();
+	::write(outFd, xrefText.c_str(), xrefText.size());
+
+	// New trailer points to the previous xref section
+	if (trailer.hasKey("Prev"))
+	    delete trailer["Prev"];
+
+	trailer["Prev"] = new Integer((int)xrefOffset);
+
+	const std::string trailerText = trailer.dictionary().str();
+	::write(outFd, "trailer\n", 8);
+	::write(outFd, trailerText.c_str(), trailerText.size());
+
+	std::stringstream footer;
+	footer << "startxref\n" << updatedXrefOffset << "\n%%EOF";
+
+	const std::string footerText = footer.str();
+	::write(outFd, footerText.c_str(), footerText.size());
+
+	close(outFd);
+    }
+    
+    /**
+     * @brief Write the document into a file
+     *
+     * Only update mode is supported: it is delegated to writeUpdate().
+     * A full write is reported with EXCEPTION(NOT_IMPLEMENTED).
+     *
+     * @param filename Path of the destination file
+     * @param update   true to append an update to the file
+     */
+    void Parser::write(const std::string& filename, bool update)
+    {
+	if (update)
+	{
+	    writeUpdate(filename);
+	    return;
+	}
+
+	EXCEPTION(NOT_IMPLEMENTED, "Full write not implemented");
+    }
+
+}
--- a/src/uPDFTypes.cpp
+++ b/src/uPDFTypes.cpp
@ -0,0 +1,105 @@
+/*
+  Copyright 2021 Grégory Soutadé
+
+  This file is part of uPDFParser.
+
+  uPDFParser is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  uPDFParser is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Lesser General Public License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with uPDFParser. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "uPDFTypes.h"
+#include "uPDFParser_common.h"
+
+namespace uPDFParser
+{
+    /**
+     * @brief Build a Name (type NAME) holding a copy of the given text
+     *
+     * @param name Text of the name
+     */
+    Name::Name(const std::string& name):
+	DataType(DataType::TYPE::NAME)
+    {
+	this->_value = name;
+    }
+
+    /**
+     * @brief Build a String (type STRING) holding a copy of the given text
+     *
+     * @param value Content of the string
+     */
+    String::String(const std::string& value):
+	DataType(DataType::TYPE::STRING)
+    {
+	this->_value = value;
+    }
+
+    /**
+     * @brief Build an HexaString (type HEXASTRING) holding a copy of the
+     *        given text
+     *
+     * @param value Content of the string
+     */
+    HexaString::HexaString(const std::string& value):
+	DataType(DataType::TYPE::HEXASTRING)
+    {
+	this->_value = value;
+    }
+
+    /**
+     * @brief Serialize the integer, prefixed by a space
+     *
+     * When the integer is flagged as signed, a '+' is added in front of
+     * positive (or null) values. Nothing is added for negative values:
+     * std::to_string() already writes the '-', adding another one would
+     * produce an invalid "--" prefix.
+     *
+     * @return " " followed by the optional sign and the value
+     */
+    std::string Integer::str()
+    {
+	std::string res(" ");
+
+	if (_signed && _value >= 0)
+	    res += "+";
+
+	res += std::to_string(_value);
+
+	return res;
+    }
+    
+    /**
+     * @brief Serialize the real number, prefixed by a space
+     *
+     * When the number is flagged as signed, a '+' is added unless the
+     * text produced by std::to_string() already starts with '-': adding
+     * a second '-' would produce an invalid "--" prefix. Testing the
+     * produced text (and not the value) also covers negative zero.
+     *
+     * @return " " followed by the optional sign and the value
+     */
+    std::string Real::str()
+    {
+	const std::string digits = std::to_string(_value);
+	std::string res(" ");
+
+	if (_signed && digits[0] != '-')
+	    res += "+";
+
+	res += digits;
+
+	return res;
+    }
+
+    /**
+     * @brief Serialize the array
+     *
+     * Elements are serialized with their own str() between "[" and "]".
+     * A space is inserted before an element as soon as something has
+     * already been written after the opening bracket.
+     *
+     * @return Textual form of the array
+     */
+    std::string Array::str()
+    {
+	std::string out("[");
+
+	for (DataType* element : _value)
+	{
+	    // More than the opening bracket: separator needed
+	    if (out.size() > 1)
+		out += " ";
+	    out += element->str();
+	}
+
+	out += "]";
+
+	return out;
+    }
+
+    /**
+     * @brief Set the value associated to a key
+     *
+     * An existing entry with the same key is overwritten (the previous
+     * value is not deleted here).
+     *
+     * @param key   Key of the entry
+     * @param value Value to store
+     */
+    void Dictionary::addData(const std::string& key, DataType* value)
+    {
+	this->_value[key] = value;
+    }
+    
+    /**
+     * @brief Serialize the dictionary
+     *
+     * Every entry is written as "/" followed by its key, then by the
+     * str() of its value when the value is not null. The result is
+     * enclosed between "<<" and ">>" and ends with a line feed.
+     *
+     * @return Textual form of the dictionary
+     */
+    std::string Dictionary::str()
+    {
+	std::string out("<<");
+
+	for (const auto& entry : _value)
+	{
+	    out += "/";
+	    out += entry.first;
+
+	    // Keys without value are stored with a null pointer
+	    if (entry.second)
+		out += entry.second->str();
+	}
+
+	out += ">>\n";
+
+	return out;
+    }
+}