From 4d1b3f00901066d0f1306bcf3416e9be5fc81b1c Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Thu, 8 Feb 2018 09:49:51 -0800 Subject: [PATCH 01/13] cumulative fixes --- src/PdfSharp-gdi/PdfSharp-gdi.csproj | 22 ++ src/PdfSharp/Pdf.IO/Lexer.cs | 284 ++++++++++++++----- src/PdfSharp/Pdf.IO/Parser.cs | 408 +++++++++++++++++++++++---- src/PdfSharp/Pdf.IO/PdfReader.cs | 2 +- src/PdfSharp/Pdf.IO/PdfWriter.cs | 2 +- 5 files changed, 596 insertions(+), 122 deletions(-) diff --git a/src/PdfSharp-gdi/PdfSharp-gdi.csproj b/src/PdfSharp-gdi/PdfSharp-gdi.csproj index 941eee61..4ab868f8 100644 --- a/src/PdfSharp-gdi/PdfSharp-gdi.csproj +++ b/src/PdfSharp-gdi/PdfSharp-gdi.csproj @@ -98,6 +98,28 @@ none AllRules.ruleset + + true + bin\x64\Debug\ + TRACE;DEBUG;GDI;UseGdiObjects + 285212672 + 4096 + x64 + default + prompt + AllRules.ruleset + + + bin\x64\Release\ + TRACE;GDI;UseGdiObjects + 285212672 + bin\Release\PdfSharp-gdi.xml + true + true + 4096 + x64 + AllRules.ruleset + System diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index 5bff4193..40241e9b 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -76,21 +76,43 @@ public int Position } } - /// - /// Reads the next token and returns its type. If the token starts with a digit, the parameter - /// testReference specifies how to treat it. If it is false, the lexer scans for a single integer. - /// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference, - /// the token is set to the object ID followed by the generation number separated by a blank - /// (the 'R' is omitted from the token). - /// - // /// Indicates whether to test the next token if it is a reference. - public Symbol ScanNextToken() + /// + /// Reads the next token and returns its type. If the token starts with a digit, the parameter + /// testReference specifies how to treat it. If it is false, the lexer scans for a single integer. + /// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference, + /// the token is set to the object ID followed by the generation number separated by a blank + /// (the 'R' is omitted from the token). + /// + // /// Indicates whether to test the next token if it is a reference. + public Symbol ScanNextToken() + { + return ScanNextToken(out int location); + } + + /// + /// Reads the next token and returns its type. If the token starts with a digit, the parameter + /// testReference specifies how to treat it. If it is false, the lexer scans for a single integer. + /// If it is true, the lexer checks if the digit is the prefix of a reference. If it is a reference, + /// the token is set to the object ID followed by the generation number separated by a blank + /// (the 'R' is omitted from the token). + /// + // /// The start position of the next token. + public Symbol ScanNextToken(out int position) + { + Symbol symbol = Symbol.None; + if (!TryScanNextToken(out symbol, out position)) + ParserDiagnostics.HandleUnexpectedCharacter(_nextChar); + return symbol; + } + + public bool TryScanNextToken(out Symbol symbol, out int position) { Again: _token = new StringBuilder(); char ch = MoveToNonWhiteSpace(); - switch (ch) + position = Position; + switch (ch) { case '%': // Eat comments, the parser doesn't handle them @@ -99,75 +121,90 @@ public Symbol ScanNextToken() goto Again; case '/': - return _symbol = ScanName(); - - //case 'R': - // if (Lexer.IsWhiteSpace(nextChar)) - // { - // ScanNextChar(); - // return Symbol.R; - // } - // break; - + symbol = _symbol = ScanName(); + return true; + case '+': //TODO is it so easy? case '-': - return _symbol = ScanNumber(); + symbol = _symbol = ScanNumber(); + return true; case '(': - return _symbol = ScanLiteralString(); + symbol = _symbol = ScanLiteralString(); + return true; case '[': ScanNextChar(true); - return _symbol = Symbol.BeginArray; + symbol = _symbol = Symbol.BeginArray; + return true; case ']': ScanNextChar(true); - return _symbol = Symbol.EndArray; + symbol = _symbol = Symbol.EndArray; + return true; case '<': if (_nextChar == '<') { ScanNextChar(true); ScanNextChar(true); - return _symbol = Symbol.BeginDictionary; + symbol = _symbol = Symbol.BeginDictionary; + return true; } - return _symbol = ScanHexadecimalString(); + symbol = _symbol = ScanHexadecimalString(); + return true; case '>': if (_nextChar == '>') { ScanNextChar(true); ScanNextChar(true); - return _symbol = Symbol.EndDictionary; + symbol = _symbol = Symbol.EndDictionary; + return true; } - ParserDiagnostics.HandleUnexpectedCharacter(_nextChar); - break; + + symbol = _symbol = Symbol.None; + return false; case '.': - return _symbol = ScanNumber(); + symbol = _symbol = ScanNumber(); + return true; } if (char.IsDigit(ch)) #if true_ - return ScanNumberOrReference(); + symbol = ScanNumberOrReference(); + return true; #else if (PeekReference()) - return _symbol = ScanNumber(); + { + symbol = _symbol = ScanNumber(); + return true; + } else - return _symbol = ScanNumber(); + { + symbol = _symbol = ScanNumber(); + return true; + } #endif if (char.IsLetter(ch)) - return _symbol = ScanKeyword(); + { + symbol = _symbol = ScanKeyword(); + return true; + } if (ch == Chars.EOF) - return _symbol = Symbol.Eof; + { + symbol = _symbol = Symbol.Eof; + return true; + } // #??? - - ParserDiagnostics.HandleUnexpectedCharacter(ch); - return _symbol = Symbol.None; + + symbol = _symbol = Symbol.None; + return false; } - + /// /// Reads the raw content of a stream. /// @@ -190,7 +227,26 @@ public byte[] ReadStream(int length) else pos = _idxChar + 1; - _pdfSteam.Position = pos; + // Verify stream length and resolve if bad + string post_stream = ReadRawString(pos + length, ("endstream").Length); + if (post_stream != "endstream") + { + // find the first endstream occurrence + // first check to see if it is within the specified stream length. + int endstream_idx = post_stream.IndexOf("endstream", StringComparison.Ordinal); + if (endstream_idx == -1) + { + post_stream = ReadRawString(pos, _pdfLength - pos); + endstream_idx = post_stream.IndexOf("endstream", StringComparison.Ordinal); + } + + if (endstream_idx != -1) + { + length = endstream_idx; + } + } + + _pdfSteam.Position = pos; byte[] bytes = new byte[length]; int read = _pdfSteam.Read(bytes, 0, length); Debug.Assert(read == length); @@ -247,20 +303,67 @@ public Symbol ScanName() while (true) { char ch = AppendAndScanNextChar(); - if (IsWhiteSpace(ch) || IsDelimiter(ch) || ch == Chars.EOF) - return _symbol = Symbol.Name; - if (ch == '#') - { - ScanNextChar(true); - char[] hex = new char[2]; - hex[0] = _currChar; - hex[1] = _nextChar; - ScanNextChar(true); - // TODO Check syntax - ch = (char)(ushort)int.Parse(new string(hex), NumberStyles.AllowHexSpecifier); - _currChar = ch; - } + if (ch == '#') + { + ScanNextChar(true); + char[] hex = new char[2]; + hex[0] = _currChar; + hex[1] = _nextChar; + ScanNextChar(true); + // TODO Check syntax + ch = (char)(ushort)int.Parse(new string(hex), NumberStyles.AllowHexSpecifier); + _currChar = ch; + continue; + } + + if (IsNameOrCommentDelimiter(ch) || ch == Chars.EOF) + { + return _symbol = Symbol.Name; + } + + if (IsWhiteSpace(ch)) + { + //TODO: Check that the white space is valid. + return _symbol = Symbol.Name; + } + + //Handle invalid delimiters + switch (ch) + { + case '(': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case ')': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '<': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '>': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '[': + //TODO: Not Complete + if (IsWhiteSpace(_nextChar) || IsDelimiter(_nextChar) || char.IsNumber(_nextChar) || _nextChar == '-' || PeekArrayKeyword()) + { + return _symbol = Symbol.Name; + } + break; + case ']': + //TODO: Not Complete + if (IsWhiteSpace(_nextChar) || IsDelimiter(_nextChar) || _nextChar == Chars.EOF) + { + return _symbol = Symbol.Name; + } + break; + case '{': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + case '}': + //TODO: Handle invalid delimiters + return _symbol = Symbol.Name; + } } } @@ -634,20 +737,22 @@ internal char ScanNextChar(bool handleCRLF) // Treat single CR as LF. _currChar = Chars.LF; } - } + //Console.WriteLine(); + } } + //Console.Write(_currChar); return _currChar; } - ///// - ///// Resets the current token to the empty string. - ///// - //void ClearToken() - //{ - // _token.Length = 0; - //} + ///// + ///// Resets the current token to the empty string. + ///// + //void ClearToken() + //{ + // _token.Length = 0; + //} - bool PeekReference() + bool PeekReference() { // A Reference has the form "nnn mmm R". The implementation of the parser used a // reduce/shift algorithm in the first place. But this case is the only one we need to @@ -695,6 +800,39 @@ bool PeekReference() return false; } + bool PeekArrayKeyword() + { + StringBuilder token = _token; + int position = Position; + ScanNextChar(true); + + //Pretty sure I want to skip any non white space + char ch = MoveToNonWhiteSpace(); + + //reset the _token + _token = new StringBuilder(); + + while (!IsWhiteSpace(ch) && !IsDelimiter(ch)) + { + ch = AppendAndScanNextChar(); + } + + bool b_is_keyword = false; + switch (_token.ToString()) + { + case "null": + case "true": + case "false": + b_is_keyword = true; + break; + } + + Position = position; + _token = token; + + return b_is_keyword; + } + /// /// Appends current character to the token and reads next one. /// @@ -882,10 +1020,24 @@ internal static bool IsDelimiter(char ch) return false; } - /// - /// Gets the length of the PDF output. - /// - public int PdfLength + /// + /// Indicates whether the specified character is a PDF delimiter character. + /// + internal static bool IsNameOrCommentDelimiter(char ch) + { + switch (ch) + { + case '/': + case '%': + return true; + } + return false; + } + + /// + /// Gets the length of the PDF output. + /// + public int PdfLength { get { return _pdfLength; } } diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index 07b353a9..5cac7376 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -80,6 +80,22 @@ public int MoveToObject(PdfObjectID objectID) return _lexer.Position = position; } + /// + /// Tries to set PDF input stream position to the specified object. + /// + public bool TryMoveToObject(PdfObjectID objectID, out int position) + { + position = _document._irefTable[objectID].Position; + if (position == -1) + { + position = _lexer.Position; + return false; + } + + _lexer.Position = position; + return true; + } + public Symbol Symbol { get { return _lexer.Symbol; } @@ -118,7 +134,8 @@ public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool incl int generationNumber = objectID.GenerationNumber; if (!fromObjecStream) { - MoveToObject(objectID); + if (!TryMoveToObject(objectID, out int position)) + return null; objectNumber = ReadInteger(); generationNumber = ReadInteger(); } @@ -261,49 +278,35 @@ public PdfObject ReadObject(PdfObject pdfObject, PdfObjectID objectID, bool incl ParserDiagnostics.HandleUnexpectedToken(_lexer.Token); break; } - symbol = ScanNextToken(); - if (symbol == Symbol.BeginStream) + + int revert_pos = _lexer.Position; + + ParserState state = SaveState(); + TryScanNextToken(out symbol); + if (symbol == Symbol.BeginStream || symbol == Symbol.None) { + if (symbol == Symbol.None) + { + // Failed to get a proper symbol + // probably missing "stream" token + RestoreState(state); + } + PdfDictionary dict = (PdfDictionary)pdfObject; Debug.Assert(checkForStream, "Unexpected stream..."); -#if true_ - ReadStream(dict); -#else + int length = GetStreamLength(dict); byte[] bytes = _lexer.ReadStream(length); -#if true_ - if (dict.Elements.GetString("/Filter") == "/FlateDecode") - { - if (dict.Elements["/Subtype"] == null) - { - try - { - byte[] decoded = Filtering.FlateDecode.Decode(bytes); - if (decoded.Length == 0) - goto End; - string pageContent = Filtering.FlateDecode.DecodeToString(bytes); - if (pageContent.Length > 100) - pageContent = pageContent.Substring(pageContent.Length - 100); - pageContent.GetType(); - bytes = decoded; - dict.Elements.Remove("/Filter"); - dict.Elements.SetInteger("/Length", bytes.Length); - } - catch - { - } - } - End: ; - } -#endif + PdfDictionary.PdfStream stream = new PdfDictionary.PdfStream(bytes, dict); dict.Stream = stream; - ReadSymbol(Symbol.EndStream); - symbol = ScanNextToken(); -#endif + + revert_pos = _lexer.Position; + while ((symbol = ScanNextToken()) == Symbol.EndStream); } - if (!fromObjecStream && symbol != Symbol.EndObj) - ParserDiagnostics.ThrowParserException(PSSR.UnexpectedToken(_lexer.Token)); + if (!fromObjecStream && symbol != Symbol.EndObj) + _lexer.Position = revert_pos; + return pdfObject; } @@ -322,7 +325,7 @@ private void ReadStream(PdfDictionary dict) Debug.Assert(dict.Stream == null, "Dictionary already has a stream."); dict.Stream = stream; ReadSymbol(Symbol.EndStream); - ScanNextToken(); + while (ScanNextToken() == Symbol.EndStream); } // HACK: Solve problem more general. @@ -339,11 +342,47 @@ private int GetStreamLength(PdfDictionary dict) if (reference != null) { ParserState state = SaveState(); - object length = ReadObject(null, reference.ObjectID, false, false); + object pdf_obj = ReadObject(null, reference.ObjectID, false, false); RestoreState(state); - int len = ((PdfIntegerObject)length).Value; - dict.Elements["/Length"] = new PdfInteger(len); - return len; + + + + + int len = -1; + if (pdf_obj is PdfIntegerObject length_obj) + { + len = length_obj.Value; + } + // For whatever reason, ReadObject() did not return a valid PdfIntegerObject + else + { + // Read 1k chunks until we find an "endstream" symbol + string content = ""; + int read_pos = _lexer.Position; + int se = -1; + while (true) + { + int read_len = Math.Min(_lexer.PdfLength - read_pos, 1024); + content += _lexer.ReadRawString(read_pos, read_len); + read_pos += 1024; + + se = content.IndexOf("endstream", StringComparison.Ordinal); + if (se != -1) + { + len = se - 2; // By spec, the stream should start on a new line. remove crlf chars from the count. + break; + } + + if (read_pos >= _lexer.PdfLength) + break; + } + } + + if (len != -1) + { + dict.Elements["/Length"] = new PdfInteger(len); + return len; + } } throw new InvalidOperationException("Cannot retrieve stream length."); } @@ -537,10 +576,41 @@ private void ParseObject(Symbol stop) //case Symbol.StartXRef: //case Symbol.Eof: default: - ParserDiagnostics.HandleUnexpectedToken(_lexer.Token); - SkipCharsUntil(stop); - return; - } + // Any Keyword can be treated as a literal string. + switch (stop) + { + case Symbol.EndArray: + // Arrays are space delimited. + while (true) + { + char ch = _lexer.AppendAndScanNextChar(); + if (Lexer.IsWhiteSpace(ch) || ch == Chars.EOF || ch == Chars.BracketRight) + { + _stack.Shift(new PdfString(_lexer.Token, PdfStringFlags.RawEncoding)); + break; + } + } + break; + case Symbol.EndDictionary: + // Dictionaries are key value pairs where key must be a name. + while (true) + { + char ch = _lexer.AppendAndScanNextChar(); + if (ch == Chars.Slash || ch == Chars.Greater) + { + _stack.Shift(new PdfString(_lexer.Token, PdfStringFlags.RawEncoding)); + break; + } + } + break; + default: + ParserDiagnostics.HandleUnexpectedToken(_lexer.Token); + SkipCharsUntil(stop); + break; + } + + return; + } } ParserDiagnostics.ThrowParserException("Unexpected end of file."); // TODO L10N using PSSR. } @@ -549,6 +619,16 @@ private Symbol ScanNextToken() { return _lexer.ScanNextToken(); } + + private Symbol ScanNextToken(out int position) + { + return _lexer.ScanNextToken(out position); + } + + private bool TryScanNextToken(out Symbol symbol) + { + return _lexer.TryScanNextToken(out symbol, out int position); + } private Symbol ScanNextToken(out string token) { @@ -1031,12 +1111,61 @@ internal PdfTrailer ReadTrailer() throw new Exception("The StartXRef table could not be found, the file cannot be opened."); ReadSymbol(Symbol.StartXRef); - _lexer.Position = ReadInteger(); - - // Read all trailers. - while (true) + int startxref = _lexer.Position = ReadInteger(); + + // Must be before the first 'goto valid_xref;' statement. + int xref_offset = 0; + + // Check for valid startxref + if (IsValidXref()) + { + goto valid_xref; + } + + // If we reach this point, we have an invalid startxref + // First look for bytes preceding "%PDF-". Some pdf producers ignore these. + if (length >= 1024) + { + // "%PDF-" should be in this range + string header = _lexer.ReadRawString(0, 1024); + idx = header.IndexOf("%PDF-", StringComparison.Ordinal); + } + else + { + string header = _lexer.ReadRawString(0, length); + idx = header.IndexOf("%PDF-", StringComparison.Ordinal); + } + + if (idx > 0) + { + //_lexer.ByteOffset = idx; + _lexer.Position = startxref + idx; + if (IsValidXref()) + { + xref_offset = idx; + goto valid_xref; + } + } + + _lexer.Position = startxref; + // Check for valid startxref + if (!IsValidXref()) + { + PdfTrailer trailer = TryRecreateXRefTableAndTrailer(_document._irefTable); + if (trailer == null) + throw new Exception("Could not recreate the xref table or trailer."); + + _document._trailer = trailer; + return _document._trailer; + } + + valid_xref: + _lexer.Position = startxref + xref_offset; + + // Read all trailers. + while (true) { - PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable); + PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable, xref_offset); // 1st trailer seems to be the best. if (_document._trailer == null) _document._trailer = trailer; @@ -1052,9 +1181,180 @@ internal PdfTrailer ReadTrailer() } /// - /// Reads cross reference table(s) and trailer(s). + /// Checks that the current _lexer location is a valid xref. /// - private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) + /// + private bool IsValidXref() + { + int length = _lexer.PdfLength; + int position = _lexer.Position; + // Make sure not inside a stream. + + string content = ""; + int content_pos = position; + while (true) + { + // look for stream and endstream in 1k chunks. + int read_length = Math.Min(1024, length - content_pos); + content += _lexer.ReadRawString(content_pos, read_length); + + int ss = content.IndexOf("stream", StringComparison.Ordinal); + int es = content.IndexOf("endstream", StringComparison.Ordinal); + int eof = content.IndexOf("%%EOF", StringComparison.Ordinal); + + if (ss != es) + { + if (ss == -1) + { + if (eof != -1 && eof < es) + break; + else + return false; + } + else if (es == -1) + break; + else if (ss < es) + break; + else if (ss > es) + { + if (eof != -1 && eof < ss && eof < es) + break; + else + return false; + } + } + + if (eof != -1) + break; + + content_pos = content_pos + read_length; + if (content_pos + read_length >= length) + { + // reached the end of the document without finding either. + break; + } + } + + _lexer.Position = position; + + Symbol symbol = ScanNextToken(); + if (symbol == Symbol.XRef) + { + return true; + } + + if (symbol == Symbol.Integer) + { + // Just because we have an integer, doesn't mean the startxref is actually valid + if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) + { + return true; + } + } + + return false; + } + + private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) + { + // Let's first check for a trailer + int length = _lexer.PdfLength; + + int trail_idx; + if (length >= 1024) + { + string trail = _lexer.ReadRawString(length - 1024, 1024); + trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal); + _lexer.Position = length - 1024 + trail_idx; + } + else + { + string trail = _lexer.ReadRawString(0, length); + trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal); + _lexer.Position = trail_idx; + } + + if (trail_idx == -1) + return null; //TODO: Look for compressed xref table that should contain the trailer + + ReadSymbol(Symbol.Trailer); + ReadSymbol(Symbol.BeginDictionary); + PdfTrailer trailer = new PdfTrailer(_document); + ReadDictionary(trailer, false); + + // Recreate the xref table. + // + // When symbol == Symbol.Obj + // [0] - generation + // [1] - id + TokenInfo[] token_stack = new TokenInfo[2]; + _lexer.Position = 0; + while (true) + { + Symbol symbol = ScanNextToken(out int position); + if (symbol == Symbol.Eof) + break; + + // we need to skip over streams entirely + if (symbol == Symbol.BeginStream) + { + // We're not reading any data from the object so wee need to find endstream + int pos = _lexer.Position; + string trail = ""; + int trail_pos = pos; + while (true) + { + // look for endstream in 1k chunks. + int trail_length = Math.Min(1024, length - trail_pos); + trail += _lexer.ReadRawString(trail_pos, trail_length); + int stop = trail.IndexOf("endstream", StringComparison.Ordinal); + if (stop != -1) + { + _lexer.Position = stop + pos; + break; + } + + trail_pos = trail_pos + trail_length; + if (trail_pos + trail_length >= length) + { + // No endstream was found. + throw new Exception("endstream not found."); + } + } + } + + if (symbol == Symbol.Obj && + token_stack[0].Symbol == Symbol.Integer && + token_stack[1].Symbol == Symbol.Integer) + { + PdfObjectID objectID = new PdfObjectID(token_stack[1].Number, token_stack[0].Number); + if (!xrefTable.Contains(objectID)) + xrefTable.Add(new PdfReference(objectID, token_stack[1].Position)); + //ReadObject(null, objectID, false, false); // Can't do this because the object value will never be set after + //SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions + } + + token_stack[1] = token_stack[0]; + TokenInfo token_info = new TokenInfo { Symbol = symbol, Position = position }; + if (symbol == Symbol.Integer) + token_info.Number = _lexer.TokenToInteger; + token_stack[0] = token_info; + } + + return trailer; + } + + struct TokenInfo + { + public int Position; + public Symbol Symbol; + public int Number; + } + + /// + /// Reads cross reference table(s) and trailer(s). + /// + private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset) { Debug.Assert(xrefTable != null); @@ -1072,7 +1372,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) int length = ReadInteger(); for (int id = start; id < start + length; id++) { - int position = ReadInteger(); + int position = ReadInteger() + xrefOffset; int generation = ReadInteger(); ReadSymbol(Symbol.Keyword); string token = _lexer.Token; @@ -1129,7 +1429,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) } return null; } - + /// /// Checks the x reference table entry. Returns true if everything is correct. /// Return false if the keyword "obj" was found, but ID or Generation are incorrect. diff --git a/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/PdfSharp/Pdf.IO/PdfReader.cs index 08a9f965..2a2abc6f 100644 --- a/src/PdfSharp/Pdf.IO/PdfReader.cs +++ b/src/PdfSharp/Pdf.IO/PdfReader.cs @@ -439,7 +439,7 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo { Debug.WriteLine(ex.Message); // 4STLA rethrow exception to notify caller. - throw; + //throw; } } else diff --git a/src/PdfSharp/Pdf.IO/PdfWriter.cs b/src/PdfSharp/Pdf.IO/PdfWriter.cs index 08071c5c..feafe404 100644 --- a/src/PdfSharp/Pdf.IO/PdfWriter.cs +++ b/src/PdfSharp/Pdf.IO/PdfWriter.cs @@ -245,7 +245,7 @@ public void Write(PdfName value) case '[': case ']': case '#': - break; + break; default: pdf.Append(name[idx]); From fcdfd3d60a4b9b94586fc90e4bdc659ba73e2383 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Wed, 30 May 2018 11:29:03 -0700 Subject: [PATCH 02/13] Added fix for 0 position xref entries not being marked as free. --- src/PdfSharp/Pdf.IO/Parser.cs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index 5cac7376..e6a3a16e 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -1382,6 +1382,14 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int // Skip unused entries. if (token != "n") continue; + + // Mac OS X 10.12.6 Quartz PDFContext fails to mark 0 position entries as free. + // According to spec, we could skip anything less than 8 (e.g. '%PDF-1.n' where n is a digit between 0 and 7 must be the header of a file) + // but anything between 0 and 8 (1-7) could be the indication of a much larger problem. + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.2 + // Skip 0 position entries. + if (position == 0) + continue; #if true //!!!new 2018-03-14 begin // Check if the object at the address has the correct ID and generation. From a3c77c21eedb7d1f8550b990410e173434aaad8f Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Mon, 4 Jun 2018 08:48:54 -0700 Subject: [PATCH 03/13] Proper fix for invalid startxref. --- src/PdfSharp/Pdf.Advanced/PdfTrailer.cs | 33 +++- src/PdfSharp/Pdf.IO/Lexer.cs | 5 +- src/PdfSharp/Pdf.IO/Parser.cs | 234 +++++++++++------------- src/PdfSharp/Pdf.IO/PdfReader.cs | 12 +- 4 files changed, 149 insertions(+), 135 deletions(-) diff --git a/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs b/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs index cd56e94f..3810ca9a 100644 --- a/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs +++ b/src/PdfSharp/Pdf.Advanced/PdfTrailer.cs @@ -75,7 +75,7 @@ public PdfTrailer(PdfCrossReferenceStream trailer) if (id != null) Elements.SetValue(Keys.ID, id); } - + public int Size { get { return Elements.GetInteger(Keys.Size); } @@ -218,6 +218,37 @@ internal void Finish() _document._irefTable.IsUnderConstruction = false; } + /// + /// Constructs the PdfTrailer from a document. + /// + /// the parser used to read the file. + internal void ConstructFromDocument(Parser parser) + { + // TODO - May need to also search for encryption related trailer info + PdfCrossReferenceTable xrefTable = _document._irefTable; + Elements.SetInteger(Keys.Size, xrefTable.ObjectTable.Count); + + // find the root. + PdfDictionary rootToUse = null; + foreach (var reference in xrefTable.AllReferences) + { + PdfObject obj = parser.ReadObject(null, reference.ObjectID, false, false); + if (obj is PdfDictionary dObj) + { + if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog") + { + if (rootToUse == null) + rootToUse = dObj; + else if (dObj.ObjectID.GenerationNumber > rootToUse.ObjectID.GenerationNumber) + rootToUse = dObj; + } + } + } + + if (rootToUse != null) + Elements.SetReference(Keys.Root, rootToUse); + } + /// /// Predefined keys of this dictionary. /// diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index 40241e9b..e4933620 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -117,7 +117,10 @@ public bool TryScanNextToken(out Symbol symbol, out int position) case '%': // Eat comments, the parser doesn't handle them //return symbol = ScanComment(); - ScanComment(); + symbol = _symbol = ScanComment(); + // Do not eat EOF + if (symbol == Symbol.Eof) + return true; goto Again; case '/': diff --git a/src/PdfSharp/Pdf.IO/Parser.cs b/src/PdfSharp/Pdf.IO/Parser.cs index e6a3a16e..94bc7d20 100644 --- a/src/PdfSharp/Pdf.IO/Parser.cs +++ b/src/PdfSharp/Pdf.IO/Parser.cs @@ -1110,62 +1110,31 @@ internal PdfTrailer ReadTrailer() if (idx == -1) throw new Exception("The StartXRef table could not be found, the file cannot be opened."); - ReadSymbol(Symbol.StartXRef); - int startxref = _lexer.Position = ReadInteger(); - - // Must be before the first 'goto valid_xref;' statement. - int xref_offset = 0; - - // Check for valid startxref - if (IsValidXref()) - { - goto valid_xref; - } - - // If we reach this point, we have an invalid startxref - // First look for bytes preceding "%PDF-". Some pdf producers ignore these. - if (length >= 1024) - { - // "%PDF-" should be in this range - string header = _lexer.ReadRawString(0, 1024); - idx = header.IndexOf("%PDF-", StringComparison.Ordinal); - } - else - { - string header = _lexer.ReadRawString(0, length); - idx = header.IndexOf("%PDF-", StringComparison.Ordinal); - } - - if (idx > 0) - { - //_lexer.ByteOffset = idx; - _lexer.Position = startxref + idx; - if (IsValidXref()) - { - xref_offset = idx; - goto valid_xref; - } - } - - _lexer.Position = startxref; + Symbol s = ReadSymbol(Symbol.StartXRef); + _lexer.Position = ReadInteger(); + + // Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154) + // Problem: certificate data added to the start of file. Invalid startxref byte offset + // Fix: We could search for the a valid xref table but all byte offsets are probably incorrect. + // Probably best to just recreate the xref table. + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.5.5 + // Check for valid startxref if (!IsValidXref()) { - PdfTrailer trailer = TryRecreateXRefTableAndTrailer(_document._irefTable); - if (trailer == null) + PdfTrailer trailer; + bool bSuccess = TryRecreateXRefTableAndTrailer(out trailer, _document); + if (!bSuccess) throw new Exception("Could not recreate the xref table or trailer."); _document._trailer = trailer; return _document._trailer; } - - valid_xref: - _lexer.Position = startxref + xref_offset; - + // Read all trailers. while (true) { - PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable, xref_offset); + PdfTrailer trailer = ReadXRefTableAndTrailer(_document._irefTable); // 1st trailer seems to be the best. if (_document._trailer == null) _document._trailer = trailer; @@ -1186,114 +1155,101 @@ internal PdfTrailer ReadTrailer() /// private bool IsValidXref() { - int length = _lexer.PdfLength; int position = _lexer.Position; - // Make sure not inside a stream. - - string content = ""; - int content_pos = position; - while (true) + try { - // look for stream and endstream in 1k chunks. - int read_length = Math.Min(1024, length - content_pos); - content += _lexer.ReadRawString(content_pos, read_length); - - int ss = content.IndexOf("stream", StringComparison.Ordinal); - int es = content.IndexOf("endstream", StringComparison.Ordinal); - int eof = content.IndexOf("%%EOF", StringComparison.Ordinal); + Symbol symbol = ScanNextToken(); + if (symbol == Symbol.XRef) // xref table + { + _lexer.Position = position; + return true; + } - if (ss != es) + if (symbol == Symbol.Integer) // Linearization parameter dictionary { - if (ss == -1) - { - if (eof != -1 && eof < es) - break; - else - return false; - } - else if (es == -1) - break; - else if (ss < es) - break; - else if (ss > es) + // Just because we have an integer, doesn't mean the startxref is actually valid + if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) { - if (eof != -1 && eof < ss && eof < es) - break; - else - return false; + _lexer.Position = position; + return true; } } - if (eof != -1) - break; - - content_pos = content_pos + read_length; - if (content_pos + read_length >= length) - { - // reached the end of the document without finding either. - break; - } + _lexer.Position = position; + return false; } + catch + { + _lexer.Position = position; + return false; + } + } - _lexer.Position = position; + private bool TryRecreateXRefTableAndTrailer(out PdfTrailer trailer, PdfDocument document) + { + PdfCrossReferenceTable xrefTable = document._irefTable; + trailer = null; + int length = _lexer.PdfLength; - Symbol symbol = ScanNextToken(); - if (symbol == Symbol.XRef) + // because some pdf producers put random info before the header, we need to find a proper starting position. + // i.e. Producer: iText1.3.1 by lowagie.com (based on itext-paulo-154) + int startIdx = -1; + string contents = ""; + for (int i = 0, pos = 0; startIdx == -1 && pos < length; i++, pos = 1024 * i) { - return true; + int len = Math.Min(1024, length - pos); + contents = $"{contents}{_lexer.ReadRawString(pos, len)}"; + startIdx = contents.IndexOf("%PDF-1.", StringComparison.Ordinal); } - if (symbol == Symbol.Integer) + if (startIdx == -1) + return false; + + // Don't look past the last %%EOF marker + int endIdx = -1; + contents = ""; + for (int i = 1; endIdx == -1; i++) { - // Just because we have an integer, doesn't mean the startxref is actually valid - if (ScanNextToken() == Symbol.Integer && ScanNextToken() == Symbol.Obj) + int pos = length - (1024 * i); + int len = 1024; + + if (pos < 0) { - return true; + len = len + pos; + pos = 0; } - } - return false; - } - - private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) - { - // Let's first check for a trailer - int length = _lexer.PdfLength; - - int trail_idx; - if (length >= 1024) - { - string trail = _lexer.ReadRawString(length - 1024, 1024); - trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal); - _lexer.Position = length - 1024 + trail_idx; - } - else - { - string trail = _lexer.ReadRawString(0, length); - trail_idx = trail.LastIndexOf("trailer", StringComparison.Ordinal); - _lexer.Position = trail_idx; - } + contents = $"{_lexer.ReadRawString(pos, len)}{contents}"; + endIdx = contents.LastIndexOf("%%EOF", StringComparison.Ordinal); + if (endIdx != -1) + endIdx = length - contents.Length + endIdx; - if (trail_idx == -1) - return null; //TODO: Look for compressed xref table that should contain the trailer + if (pos == 0) + break; + } - ReadSymbol(Symbol.Trailer); - ReadSymbol(Symbol.BeginDictionary); - PdfTrailer trailer = new PdfTrailer(_document); - ReadDictionary(trailer, false); + if (endIdx == -1) + return false; + endIdx = endIdx + 5; // This should be where Eof char is + // Recreate the xref table. // // When symbol == Symbol.Obj // [0] - generation // [1] - id TokenInfo[] token_stack = new TokenInfo[2]; - _lexer.Position = 0; + + _lexer.Position = startIdx; while (true) { Symbol symbol = ScanNextToken(out int position); - if (symbol == Symbol.Eof) - break; + if (symbol == Symbol.Eof) + { + // Check if it's the last EOF + if (_lexer.Position >= endIdx) + break; // This is the end of the file. + } // we need to skip over streams entirely if (symbol == Symbol.BeginStream) @@ -1327,6 +1283,7 @@ private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTab token_stack[0].Symbol == Symbol.Integer && token_stack[1].Symbol == Symbol.Integer) { + // TODO:: Do we only need the most recent revision? PdfObjectID objectID = new PdfObjectID(token_stack[1].Number, token_stack[0].Number); if (!xrefTable.Contains(objectID)) xrefTable.Add(new PdfReference(objectID, token_stack[1].Position)); @@ -1334,14 +1291,37 @@ private PdfTrailer TryRecreateXRefTableAndTrailer(PdfCrossReferenceTable xrefTab //SkipCharsUntil(Symbol.EndObj); // Can't do this because streams will cause exceptions } - token_stack[1] = token_stack[0]; + token_stack[1] = token_stack[0]; TokenInfo token_info = new TokenInfo { Symbol = symbol, Position = position }; if (symbol == Symbol.Integer) token_info.Number = _lexer.TokenToInteger; token_stack[0] = token_info; } - return trailer; + // find the root. +// foreach (var reference in xrefTable.AllReferences) +// { +// PdfObject obj = ReadObject(null, reference.ObjectID, false, false); +// if (obj is PdfDictionary dObj) +// { +// if (dObj.Elements[PdfCatalog.Keys.Type] as PdfName == "/Catalog") +// { +// PdfCatalog catalog = new PdfCatalog(dObj); +// } +// } +// } + + + + + + + + + trailer = new PdfTrailer(_document); + trailer.ConstructFromDocument(this); + + return true; } struct TokenInfo @@ -1354,7 +1334,7 @@ struct TokenInfo /// /// Reads cross reference table(s) and trailer(s). /// - private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int xrefOffset) + private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable) { Debug.Assert(xrefTable != null); @@ -1372,7 +1352,7 @@ private PdfTrailer ReadXRefTableAndTrailer(PdfCrossReferenceTable xrefTable, int int length = ReadInteger(); for (int id = start; id < start + length; id++) { - int position = ReadInteger() + xrefOffset; + int position = ReadInteger(); int generation = ReadInteger(); ReadSymbol(Symbol.Keyword); string token = _lexer.Token; diff --git a/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/PdfSharp/Pdf.IO/PdfReader.cs index 2a2abc6f..b6a665e2 100644 --- a/src/PdfSharp/Pdf.IO/PdfReader.cs +++ b/src/PdfSharp/Pdf.IO/PdfReader.cs @@ -279,7 +279,7 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMode openmode, PdfPasswordProvider passwordProvider) { PdfDocument document; - try + //try { Lexer lexer = new Lexer(stream); document = new PdfDocument(lexer); @@ -500,11 +500,11 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo document._irefTable.CheckConsistence(); } } - catch (Exception ex) - { - Debug.WriteLine(ex.Message); - throw; - } +// catch (Exception ex) +// { +// Debug.WriteLine(ex.Message); +// throw; +// } return document; } From dc610808bcd0ef30b5ef2e31b5397f8c725b3f9f Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Mon, 19 Nov 2018 09:09:35 -0800 Subject: [PATCH 04/13] Fixed a bug where ScanName() would not correctly identify the BeginDictionary delimiter if the next char was a '.' --- src/PdfSharp/Pdf.IO/Lexer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index e4933620..da4f2eeb 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -348,7 +348,7 @@ public Symbol ScanName() return _symbol = Symbol.Name; case '[': //TODO: Not Complete - if (IsWhiteSpace(_nextChar) || IsDelimiter(_nextChar) || char.IsNumber(_nextChar) || _nextChar == '-' || PeekArrayKeyword()) + if (IsWhiteSpace(_nextChar) || IsDelimiter(_nextChar) || char.IsNumber(_nextChar) || _nextChar == '.' || _nextChar == '-' || PeekArrayKeyword()) { return _symbol = Symbol.Name; } From 3f657067191c272b6d6efc4dcb4bb604b1969e37 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Tue, 2 Apr 2019 12:10:59 -0700 Subject: [PATCH 05/13] Resolved bad end stream checks that slowed stream object reading. --- src/PdfSharp/Pdf.IO/Lexer.cs | 75 +++++++++++++++++++++++++++++------- 1 file changed, 62 insertions(+), 13 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index da4f2eeb..8372328f 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -230,23 +230,72 @@ public byte[] ReadStream(int length) else pos = _idxChar + 1; - // Verify stream length and resolve if bad - string post_stream = ReadRawString(pos + length, ("endstream").Length); - if (post_stream != "endstream") - { + // Producer: + // Problem: Incorrect stream length + // Fix: Find the endstream keyword and measure the length + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Producer: + // Problem: Not all pdf producers add a eol marker before endstream + // Fix: double check for endstream without the eol marker + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Producer: + // Problem: Some pdf producers replace the eol marker with a carriage return + // Fix: double check for endstream without the eol marker + // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 + + // Verify stream length and resolve if bad + string nendstream = $"{'\n'}endstream"; + string rendstream = $"{'\r'}endstream"; + string endstream = "endstream"; + + string postStream = ReadRawString(pos + length, nendstream.Length); + + bool bValid = postStream == nendstream || + postStream == rendstream || + postStream.StartsWith(endstream); // Not all pdf producers add a eol marker before endstream + + if (!bValid) + { + string[] endstreamValues = { nendstream, rendstream, endstream }; + + int IndexOfEndStream(string val) + { + // Find the smallest value + int offset = -1; + + foreach (var es in endstreamValues) + { + int o = val.IndexOf(es, StringComparison.Ordinal); + if (o < offset || offset == -1) + { + offset = o; + } + } + + return offset; + } + + // find the first endstream occurrence // first check to see if it is within the specified stream length. - int endstream_idx = post_stream.IndexOf("endstream", StringComparison.Ordinal); - if (endstream_idx == -1) - { - post_stream = ReadRawString(pos, _pdfLength - pos); - endstream_idx = post_stream.IndexOf("endstream", StringComparison.Ordinal); - } + int idxOffset = IndexOfEndStream(postStream); + if (idxOffset != -1) + { + length = length + idxOffset; + } - if (endstream_idx != -1) + if (idxOffset == -1) { - length = endstream_idx; - } + // TODO:: read in chunks + postStream = ReadRawString(pos, _pdfLength - pos); + idxOffset = IndexOfEndStream(postStream); + if (idxOffset != -1) + { + length = idxOffset; + } + } } _pdfSteam.Position = pos; From 2f5ed1ebc83306bf620f0dd3499dd50734eb1b55 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Tue, 30 Apr 2019 11:13:04 -0700 Subject: [PATCH 06/13] Added DEBUG conditional around a try catch block. Helps with debugging to not catch and rethrow. --- src/PdfSharp/Pdf/PdfPages.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/PdfSharp/Pdf/PdfPages.cs b/src/PdfSharp/Pdf/PdfPages.cs index de2ee441..eb612221 100644 --- a/src/PdfSharp/Pdf/PdfPages.cs +++ b/src/PdfSharp/Pdf/PdfPages.cs @@ -612,8 +612,9 @@ PdfDictionary[] GetKids(PdfReference iref, PdfPage.InheritedValues values, PdfDi PdfPage.InheritValues(kid, values); return new PdfDictionary[] { kid }; } - - if (string.IsNullOrEmpty(type)) + + // If it has kids, it's logically not going to be type page. + if (string.IsNullOrEmpty(type) && !kid.Elements.ContainsKey("/Kids")) { // Type is required. If type is missing, assume it is "/Page" and hope it will work. // TODO Implement a "Strict" mode in PDFsharp and don't do this in "Strict" mode. From 6ddd84a5d150ebf0ff1748167b7ad4b47497967f Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Tue, 30 Apr 2019 11:14:53 -0700 Subject: [PATCH 07/13] GetKids no longer assumes page if missing type when dictionary contains the /Kids key. --- src/PdfSharp/Pdf.IO/PdfReader.cs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/PdfReader.cs b/src/PdfSharp/Pdf.IO/PdfReader.cs index b6a665e2..21eaef5b 100644 --- a/src/PdfSharp/Pdf.IO/PdfReader.cs +++ b/src/PdfSharp/Pdf.IO/PdfReader.cs @@ -279,7 +279,9 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMode openmode, PdfPasswordProvider passwordProvider) { PdfDocument document; - //try +#if !DEBUG + try +#endif { Lexer lexer = new Lexer(stream); document = new PdfDocument(lexer); @@ -500,11 +502,13 @@ public static PdfDocument Open(Stream stream, string password, PdfDocumentOpenMo document._irefTable.CheckConsistence(); } } -// catch (Exception ex) -// { -// Debug.WriteLine(ex.Message); -// throw; -// } +#if !DEBUG + catch (Exception ex) + { + Debug.WriteLine(ex.Message); + throw; + } +#endif return document; } From 8c45b47d3747fe93a7df87ea581838bb533c5250 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Mon, 22 Jul 2019 07:41:19 -0700 Subject: [PATCH 08/13] When checking for a valid stream length, now also checks for endstream with CRLF. --- src/PdfSharp/Pdf.IO/Lexer.cs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index 8372328f..4607f94a 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -244,16 +244,18 @@ public byte[] ReadStream(int length) // Problem: Some pdf producers replace the eol marker with a carriage return // Fix: double check for endstream without the eol marker // https://www.adobe.com/content/dam/acom/en/devnet/acrobat/pdfs/PDF32000_2008.pdf 7.3.8 - + // Verify stream length and resolve if bad string nendstream = $"{'\n'}endstream"; string rendstream = $"{'\r'}endstream"; + string rnendstream = $"{'\r'}{'\n'}endstream"; string endstream = "endstream"; - string postStream = ReadRawString(pos + length, nendstream.Length); + string postStream = ReadRawString(pos + length, rnendstream.Length); - bool bValid = postStream == nendstream || - postStream == rendstream || + bool bValid = postStream.StartsWith(nendstream) || + postStream.StartsWith(rendstream) || + postStream.StartsWith(rnendstream) || postStream.StartsWith(endstream); // Not all pdf producers add a eol marker before endstream if (!bValid) From 3b91dd0a224360a4d3e32e23022ed18c6141cea3 Mon Sep 17 00:00:00 2001 From: mlaukala Date: Tue, 12 Nov 2019 16:04:25 -0800 Subject: [PATCH 09/13] Set up CI with Azure Pipelines Auto build and nuget [skip ci] --- azure-pipelines.yml | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 00000000..066d959f --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,50 @@ +# ASP.NET Core (.NET Framework) +# Build and test ASP.NET Core projects targeting the full .NET Framework. +# Add steps that publish symbols, save build artifacts, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/dotnet-core + +trigger: +- Release + +pool: + vmImage: 'windows-latest' + +variables: + solution: '**/*.sln' + buildPlatform: 'Any CPU' + buildConfiguration: 'Release' + +steps: +- task: NuGetToolInstaller@1 + +- task: NuGetCommand@2 + inputs: + restoreSolution: '$(solution)' + +- task: VSBuild@1 + inputs: + solution: '$(solution)' + msbuildArgs: '/p:DeployOnBuild=true /p:WebPublishMethod=Package /p:PackageAsSingleFile=true /p:SkipInvalidConfigurations=true /p:DesktopBuildPackageLocation="$(build.artifactStagingDirectory)\WebApp.zip" /p:DeployIisAppPath="Default Web Site"' + platform: '$(buildPlatform)' + configuration: '$(buildConfiguration)' + +- task: VSTest@2 + inputs: + platform: '$(buildPlatform)' + configuration: '$(buildConfiguration)' + +- task: NuGetCommand@2 + inputs: + command: 'pack' + packagesToPack: '**/*.csproj' + versioningScheme: 'byPrereleaseNumber' + majorVersion: '1' + minorVersion: '0' + patchVersion: '0' + +- task: NuGetCommand@2 + inputs: + command: 'push' + packagesToPush: '$(Build.ArtifactStagingDirectory)/**/*.nupkg;!$(Build.ArtifactStagingDirectory)/**/*.symbols.nupkg' + nuGetFeedType: 'internal' + publishVstsFeed: 'b23e5b36-79b9-4a22-a765-20dec00e216d' From 47089a1ed44c42e17e95a23a00e8ceb4a99992f5 Mon Sep 17 00:00:00 2001 From: mlaukala Date: Tue, 12 Nov 2019 18:56:53 -0800 Subject: [PATCH 10/13] Update azure-pipelines.yml for Azure Pipelines --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 066d959f..06de390b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,7 +4,7 @@ # https://docs.microsoft.com/azure/devops/pipelines/languages/dotnet-core trigger: -- Release +- master pool: vmImage: 'windows-latest' From 1bb0cff35815c2c54f24be678c079b7a9c4601e2 Mon Sep 17 00:00:00 2001 From: mlaukala Date: Tue, 12 Nov 2019 18:59:22 -0800 Subject: [PATCH 11/13] Update azure-pipelines.yml for Azure Pipelines --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 06de390b..066d959f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,7 +4,7 @@ # https://docs.microsoft.com/azure/devops/pipelines/languages/dotnet-core trigger: -- master +- Release pool: vmImage: 'windows-latest' From 3f5e669287c9b9d100dda17539f1adfe36de7df0 Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Wed, 18 Dec 2019 15:49:20 -0800 Subject: [PATCH 12/13] Fixed another out of spec pdf issue. --- src/PdfSharp/Pdf.IO/Lexer.cs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/PdfSharp/Pdf.IO/Lexer.cs b/src/PdfSharp/Pdf.IO/Lexer.cs index 4607f94a..c0e54b70 100644 --- a/src/PdfSharp/Pdf.IO/Lexer.cs +++ b/src/PdfSharp/Pdf.IO/Lexer.cs @@ -410,7 +410,18 @@ public Symbol ScanName() { return _symbol = Symbol.Name; } - break; + + string tkn = Token; + + int position = Position; + ScanNextChar(true); + MoveToNonWhiteSpace(); + bool isRef = PeekReference(); + Position = position; + _token = new StringBuilder(tkn); + if (isRef) + return _symbol = Symbol.Name; + break; case '{': //TODO: Handle invalid delimiters return _symbol = Symbol.Name; From 8145e7bbe5d1ebac2fb7cafca2f3e0d56a0e914b Mon Sep 17 00:00:00 2001 From: Matthew Laukala Date: Wed, 8 Sep 2021 15:22:35 -0700 Subject: [PATCH 13/13] Fixed a casting where the types were incompatible. Out of spec pdf fix. --- src/PdfSharp/Pdf/PdfDictionary.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/PdfSharp/Pdf/PdfDictionary.cs b/src/PdfSharp/Pdf/PdfDictionary.cs index 428140a1..b24138af 100644 --- a/src/PdfSharp/Pdf/PdfDictionary.cs +++ b/src/PdfSharp/Pdf/PdfDictionary.cs @@ -652,8 +652,10 @@ public PdfRectangle GetRectangle(string key, bool create) array.Elements.GetReal(2), array.Elements.GetReal(3)); this[key] = value; } - else - value = (PdfRectangle)obj; + else if (obj is PdfRectangle rectangle) + { + value = rectangle; + } return value; }