diff options
author | yzrh <yzrh@noema.org> | 2023-01-01 18:58:43 +0000 |
---|---|---|
committer | yzrh <yzrh@noema.org> | 2023-01-01 18:58:43 +0000 |
commit | cde014cffbe2e8d94de144008ad00bbccbb3a8ab (patch) | |
tree | 8fc347facc93788bdc5cd30dba721716e82ab2fd /src | |
parent | 9019a184494e6fc220bcc1eb8f47f33fe0f3e506 (diff) | |
download | melon-cde014cffbe2e8d94de144008ad00bbccbb3a8ab.tar.gz melon-cde014cffbe2e8d94de144008ad00bbccbb3a8ab.tar.zst |
Improve PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/pdf_parser.c | 44 |
1 file changed, 26 insertions, 18 deletions
```diff
diff --git a/src/pdf_parser.c b/src/pdf_parser.c
index b4470f9..54c7fb4 100644
--- a/src/pdf_parser.c
+++ b/src/pdf_parser.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
+ * Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
  *
  * SPDX-License-Identifier: Apache-2.0
  */
@@ -19,12 +19,12 @@ static void *
 _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
 {
 	const char whitespace[6] = {
+		' ',
 		'\r',
 		'\n',
 		'\f',
 		'\t',
-		'\0',
-		' '
+		'\0'
 	};
 	char tmp[s1 + 1];
@@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
 	for (int i = 0; i < 6; i++) {
 		tmp[s1] = whitespace[i];
-		if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
+		if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
 			return ret;
 	}
@@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
 	end = ftell(*fp);
 	fseek(*fp, cur, SEEK_SET);
-	int head = 0;
-	int tail = 0;
+	long head = 0;
+	long tail = 0;
 	char *pos;
 	char *tmp;
 	for (;;) {
-		fread(buf, size_buf, 1, *fp);
+		if (cur + size_buf < end) {
+			fread(buf, size_buf, 1, *fp);
+		} else {
+			fread(buf, end - cur, 1, *fp);
+			memset(buf + end - cur, 0, size_buf - end + cur);
+		}
 		if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
 			head = cur + (pos - buf) + 7;
@@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
 			/* We need to check if it is the object stored in stream */
 			while (memcmp(pos + 7, "\r\nendstream", 11) == 0 &&
-				(tmp = _memmem_whitespace(pos + 6,
-				size_buf - (pos - buf) - 6,
+				(tmp = _memmem_whitespace(pos + 7,
+				size_buf - (pos - buf) - 7,
 				"endobj", 6)) != NULL)
 				pos = tmp;
@@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
 			ptr->address = head;
 			ptr->size = tail - head;
-			fseek(*fp, tail + 6, SEEK_SET);
+			fseek(*fp, tail + 7, SEEK_SET);
 			head = tail = 0;
+		} else if (head > 0 && tail > 0) {
+			fseek(*fp, head, SEEK_SET);
+			tail = 0;
 		} else {
-			fseek(*fp, -6, SEEK_CUR);
+			fseek(*fp, -7, SEEK_CUR);
 		}
-		if ((cur = ftell(*fp)) + 6 >= end)
+		if ((cur = ftell(*fp)) + 7 >= end)
 			break;
 	}
@@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
 		 * A dictionary object may have nested dictionary,
 		 * but it should not be in a stream
 		 */
-		while ((tmp = _memmem_whitespace(tail + 2,
-			ptr->size - (tail - buf) - 2,
+		while ((tmp = _memmem_whitespace(tail + 3,
+			ptr->size - (tail - buf) - 3,
 			">>", 2)) != NULL &&
-			memmem(tail + 2,
-			ptr->size - (tail - buf) - 2,
+			memmem(tail + 3,
+			ptr->size - (tail - buf) - 3,
 			"stream\r\n", 8) == NULL)
 			tail = tmp;
@@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
 		while (_memmem_whitespace(tail,
 			ptr->size - (tail - buf),
 			"endobj", 6) != NULL &&
-			(tmp = _memmem_whitespace(tail + 9,
-			ptr->size - (tail - buf) - 9,
+			(tmp = _memmem_whitespace(tail + 10,
+			ptr->size - (tail - buf) - 10,
 			"endstream", 9)) != NULL)
 			tail = tmp;
```