From cde014cffbe2e8d94de144008ad00bbccbb3a8ab Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 1 Jan 2023 18:58:43 +0000 Subject: Improve PDF parser. Signed-off-by: yzrh --- src/pdf_parser.c | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) (limited to 'src/pdf_parser.c') diff --git a/src/pdf_parser.c b/src/pdf_parser.c index b4470f9..54c7fb4 100644 --- a/src/pdf_parser.c +++ b/src/pdf_parser.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, yzrh + * Copyright (c) 2020-2023, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -19,12 +19,12 @@ static void * _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) { const char whitespace[6] = { + ' ', '\r', '\n', '\f', '\t', - '\0', - ' ' + '\0' }; char tmp[s1 + 1]; @@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) for (int i = 0; i < 6; i++) { tmp[s1] = whitespace[i]; - if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) + if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) return ret; } @@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) end = ftell(*fp); fseek(*fp, cur, SEEK_SET); - int head = 0; - int tail = 0; + long head = 0; + long tail = 0; char *pos; char *tmp; for (;;) { - fread(buf, size_buf, 1, *fp); + if (cur + size_buf < end) { + fread(buf, size_buf, 1, *fp); + } else { + fread(buf, end - cur, 1, *fp); + memset(buf + end - cur, 0, size_buf - end + cur); + } if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) head = cur + (pos - buf) + 7; @@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) /* We need to check if it is the object stored in stream */ while (memcmp(pos + 7, "\r\nendstream", 11) == 0 && - (tmp = _memmem_whitespace(pos + 6, - size_buf - (pos - buf) - 6, + (tmp = _memmem_whitespace(pos + 7, + size_buf - (pos - buf) - 7, "endobj", 6)) != NULL) pos = tmp; @@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf) ptr->address = head; ptr->size = tail - head; - fseek(*fp, tail + 6, SEEK_SET); + fseek(*fp, tail + 7, SEEK_SET); head = tail = 0; + } else if (head > 0 && tail > 0) { + fseek(*fp, head, SEEK_SET); + tail = 0; } else { - fseek(*fp, -6, SEEK_CUR); + fseek(*fp, -7, SEEK_CUR); } - if ((cur = ftell(*fp)) + 6 >= end) + if ((cur = ftell(*fp)) + 7 >= end) break; } @@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) * A dictionary object may have nested dictionary, * but it should not be in a stream */ - while ((tmp = _memmem_whitespace(tail + 2, - ptr->size - (tail - buf) - 2, + while ((tmp = _memmem_whitespace(tail + 3, + ptr->size - (tail - buf) - 3, ">>", 2)) != NULL && - memmem(tail + 2, - ptr->size - (tail - buf) - 2, + memmem(tail + 3, + ptr->size - (tail - buf) - 3, "stream\r\n", 8) == NULL) tail = tmp; @@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) while (_memmem_whitespace(tail, ptr->size - (tail - buf), "endobj", 6) != NULL && - (tmp = _memmem_whitespace(tail + 9, - ptr->size - (tail - buf) - 9, + (tmp = _memmem_whitespace(tail + 10, + ptr->size - (tail - buf) - 10, "endstream", 9)) != NULL) tail = tmp; -- cgit v1.2.3