about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: yzrh <yzrh@noema.org>, 2023-01-01 18:58:43 +0000
committer: yzrh <yzrh@noema.org>, 2023-01-01 18:58:43 +0000
commit: cde014cffbe2e8d94de144008ad00bbccbb3a8ab (patch)
tree: 8fc347facc93788bdc5cd30dba721716e82ab2fd /src
parent: 9019a184494e6fc220bcc1eb8f47f33fe0f3e506 (diff)
download: melon-cde014cffbe2e8d94de144008ad00bbccbb3a8ab.tar.gz
download: melon-cde014cffbe2e8d94de144008ad00bbccbb3a8ab.tar.zst
Improve PDF parser.
Signed-off-by: yzrh <yzrh@noema.org>
Diffstat (limited to 'src')
-rw-r--r-- src/pdf_parser.c | 44
1 files changed, 26 insertions, 18 deletions
diff --git a/src/pdf_parser.c b/src/pdf_parser.c
index b4470f9..54c7fb4 100644
--- a/src/pdf_parser.c
+++ b/src/pdf_parser.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
+ * Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@@ -19,12 +19,12 @@ static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
const char whitespace[6] = {
+ ' ',
'\r',
'\n',
'\f',
'\t',
- '\0',
- ' '
+ '\0'
};
char tmp[s1 + 1];
@@ -34,7 +34,7 @@ _memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
for (int i = 0; i < 6; i++) {
tmp[s1] = whitespace[i];
- if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
+ if ((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL)
return ret;
}
@@ -57,13 +57,18 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
end = ftell(*fp);
fseek(*fp, cur, SEEK_SET);
- int head = 0;
- int tail = 0;
+ long head = 0;
+ long tail = 0;
char *pos;
char *tmp;
for (;;) {
- fread(buf, size_buf, 1, *fp);
+ if (cur + size_buf < end) {
+ fread(buf, size_buf, 1, *fp);
+ } else {
+ fread(buf, end - cur, 1, *fp);
+ memset(buf + end - cur, 0, size_buf - end + cur);
+ }
if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL)
head = cur + (pos - buf) + 7;
@@ -72,8 +77,8 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
/* We need to check if it is the object stored in stream */
while (memcmp(pos + 7,
"\r\nendstream", 11) == 0 &&
- (tmp = _memmem_whitespace(pos + 6,
- size_buf - (pos - buf) - 6,
+ (tmp = _memmem_whitespace(pos + 7,
+ size_buf - (pos - buf) - 7,
"endobj", 6)) != NULL)
pos = tmp;
@@ -102,13 +107,16 @@ _locate(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->address = head;
ptr->size = tail - head;
- fseek(*fp, tail + 6, SEEK_SET);
+ fseek(*fp, tail + 7, SEEK_SET);
head = tail = 0;
+ } else if (head > 0 && tail > 0) {
+ fseek(*fp, head, SEEK_SET);
+ tail = 0;
} else {
- fseek(*fp, -6, SEEK_CUR);
+ fseek(*fp, -7, SEEK_CUR);
}
- if ((cur = ftell(*fp)) + 6 >= end)
+ if ((cur = ftell(*fp)) + 7 >= end)
break;
}
@@ -159,11 +167,11 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
* A dictionary object may have nested dictionary,
* but it should not be in a stream
*/
- while ((tmp = _memmem_whitespace(tail + 2,
- ptr->size - (tail - buf) - 2,
+ while ((tmp = _memmem_whitespace(tail + 3,
+ ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
- memmem(tail + 2,
- ptr->size - (tail - buf) - 2,
+ memmem(tail + 3,
+ ptr->size - (tail - buf) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
@@ -190,8 +198,8 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
while (_memmem_whitespace(tail,
ptr->size - (tail - buf),
"endobj", 6) != NULL &&
- (tmp = _memmem_whitespace(tail + 9,
- ptr->size - (tail - buf) - 9,
+ (tmp = _memmem_whitespace(tail + 10,
+ ptr->size - (tail - buf) - 10,
"endstream", 9)) != NULL)
tail = tmp;