diff options
author | yzrh <yzrh@tuta.io> | 2020-08-14 22:04:26 +0000 |
---|---|---|
committer | yzrh <yzrh@tuta.io> | 2020-08-14 22:04:26 +0000 |
commit | 12ecdd71592eccf7bdb6214edbc7318246469c1c (patch) | |
tree | fda27e41c37a2345702ad3e90480154d975e426f /src/pdf_parser.c | |
download | melon-12ecdd71592eccf7bdb6214edbc7318246469c1c.tar.gz melon-12ecdd71592eccf7bdb6214edbc7318246469c1c.tar.zst |
Initial commit.
Diffstat (limited to 'src/pdf_parser.c')
-rw-r--r-- | src/pdf_parser.c | 216 |
1 files changed, 216 insertions, 0 deletions
diff --git a/src/pdf_parser.c b/src/pdf_parser.c new file mode 100644 index 0000000..1da8dff --- /dev/null +++ b/src/pdf_parser.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2020, yzrh <yzrh@tuta.io> + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifdef __linux__ + +#define _GNU_SOURCE + +#endif /* __linux__ */ + +#include <stdlib.h> +#include <string.h> + +#include "pdf.h" + +static void * +_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) +{ + const char whitespace[6] = { + '\r', + '\n', + '\f', + '\t', + '\0', + ' ' + }; + + char tmp[s1 + 1]; + memcpy(tmp, p1, s1); + + char *ret; + + for (int i = 0; i < 6; i++) { + tmp[s1] = whitespace[i]; + if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) + return ret; + } + + return NULL; +} + +static int +_locate(pdf_object_t **pdf, FILE **fp, int size_buf) +{ + pdf_object_t *ptr = *pdf; + while (ptr->next != NULL) + ptr = ptr->next; + + char buf[size_buf]; + + long cur = ftell(*fp); + long end; + + fseek(*fp, 0, SEEK_END); + end = ftell(*fp); + fseek(*fp, cur, SEEK_SET); + + int head = 0; + int tail = 0; + char *pos; + char *tmp; + + for (;;) { + fread(buf, size_buf, 1, *fp); + + if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) + head = cur + (pos - buf) + 7; + + if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { + /* We need to check if it is the object stored in stream */ + while (memcmp(pos + 7, + "\r\nendstream", 11) == 0 && + (tmp = _memmem_whitespace(pos + 6, + size_buf - (pos - buf) - 6, + "endobj", 6)) != NULL) + pos = tmp; + + if (pos - buf < size_buf - 7) + tail = cur + (pos - buf); + } + + if (tail > head) { + if (ptr->next == NULL) { + ptr->next = malloc(sizeof(pdf_object_t)); + + if (ptr->next == NULL) + return 1; + + ptr->next->id = 0; + ptr->next->object_size = 0; + ptr->next->object = NULL; + ptr->next->dictionary_size = 0; + ptr->next->dictionary = NULL; + ptr->next->stream_size = 0; + ptr->next->stream = NULL; + ptr->next->next = NULL; + ptr = ptr->next; + } + + ptr->address = head; + ptr->size = tail - head; + + fseek(*fp, tail + 6, SEEK_SET); + head = tail = 0; + } else { + fseek(*fp, -6, SEEK_CUR); + } + + if ((cur = ftell(*fp)) + 6 >= end) + break; + } + + return 0; +} + +int +pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) +{ + if (*pdf == NULL || *fp == NULL || size_buf < 7) + return 1; + + if (_locate(pdf, fp, size_buf) != 0) + return 1; + + pdf_object_t *ptr = (*pdf)->next; + + char *buf; + char *head; + char *tail; + char *tmp; + + while (ptr != NULL) { + buf = malloc(ptr->size); + + if (buf == NULL) + return 1; + + memset(buf, 0, ptr->size); + + fseek(*fp, ptr->address - 12, SEEK_SET); + fread(buf, 8, 1, *fp); + + for (int i = 0; i < 8; i++) { + if (buf[i] >= '0' && buf[i] <= '9') { + ptr->id = atoi(buf + i); + break; + } + } + + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + + if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && + (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { + /* A dictionary object may have nested dictionary */ + while ((tmp = _memmem_whitespace(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL) + tail = tmp; + + ptr->dictionary_size = tail - head + 2; + ptr->dictionary = malloc(ptr->dictionary_size + 1); + + if (ptr->dictionary == NULL) + return 1; + + memset(ptr->dictionary, 0, ptr->dictionary_size + 1); + memcpy(ptr->dictionary, head, ptr->dictionary_size); + + if ((head = memmem(tail, + ptr->size - (tail - buf), + "stream\r\n", 8)) != NULL && + (tail = _memmem_whitespace(head, + ptr->size - (head - buf), + "endstream", 9)) != NULL) { + /* + * An object may contain a stream that + * contains another object that + * contains another stream + */ + while (_memmem_whitespace(tail, + ptr->size - (tail - buf), + "endobj", 6) != NULL && + (tmp = _memmem_whitespace(tail + 9, + ptr->size - (tail - buf) - 9, + "endstream", 9)) != NULL) + tail = tmp; + + ptr->stream_size = (tail - head) - 8; + ptr->stream = malloc(ptr->stream_size); + + if (ptr->stream == NULL) + return 1; + + memcpy(ptr->stream, head + 8, ptr->stream_size); + } + } else { + ptr->object_size = ptr->size; + ptr->object = malloc(ptr->object_size + 1); + + if (ptr->object == NULL) + return 1; + + memset(ptr->object, 0, ptr->object_size + 1); + memcpy(ptr->object, buf, ptr->object_size); + } + + free(buf); + + ptr = ptr->next; + } + + return 0; +} |