/*
* Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifdef __linux__
#define _GNU_SOURCE
#endif /* __linux__ */
#include <stdlib.h>
#include <string.h>
#include "pdf.h"
/*
 * Find the earliest occurrence in p0 (s0 bytes) of the s1-byte pattern p1
 * that is immediately followed by one of the whitespace bytes
 * 0x00, 0x09, 0x0a, 0x0c, 0x0d or 0x20.
 *
 * The trailing whitespace byte must itself lie inside the s0 bytes, so a
 * pattern that ends exactly at the end of p0 is not reported.
 *
 * Returns a pointer to the first byte of the pattern inside p0,
 * or NULL when there is no such occurrence.
 */
static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
	static const char whitespace[6] = {
		0x00,
		0x09,
		0x0a,
		0x0c,
		0x0d,
		0x20
	};

	char *ret = NULL;

	/* Pattern plus one slot for the whitespace byte being tried */
	char str[s1 + 1];
	memcpy(str, p1, s1);

	for (size_t i = 0; i < sizeof(whitespace); i++) {
		str[s1] = whitespace[i];

		char *tmp = memmem(p0, s0, str, s1 + 1);
		if (tmp == NULL)
			continue;

		/*
		 * Keep the lowest address seen so far. "Nothing found yet" is
		 * tracked through ret == NULL rather than through an offset of
		 * 0, because offset 0 is a legitimate match position and must
		 * not be overwritten by a later match.
		 */
		if (ret == NULL || tmp < ret)
			ret = tmp;
	}

	return ret;
}
/*
 * Scan the stream from its current position to its end, size_buf bytes at
 * a time, and append one node to the end of the list for every object
 * found. Each new node records the file offset just past the " 0 obj"
 * marker (address) and the distance from there to the matching "endobj"
 * (size); all other fields are zeroed.
 *
 * When an "endobj" is seen before any " 0 obj" marker, head is still 0 and
 * the node covers everything from offset 0 up to that "endobj".
 *
 * Returns 0 on success, 1 if a list node cannot be allocated.
 */
static int
_locate(pdf_object_t **pdf, FILE **fp, int size_buf)
{
	/* New nodes are appended after the current last node */
	pdf_object_t *ptr = *pdf;
	while (ptr->next != NULL)
		ptr = ptr->next;

	char buf[size_buf];

	/* Determine the stream length without losing the current position */
	long cur = ftell(*fp);
	long end;
	fseek(*fp, 0, SEEK_END);
	end = ftell(*fp);
	fseek(*fp, cur, SEEK_SET);

	long head = 0;
	long tail = 0;

	char *pos;
	char *tmp;

	for (;;) {
		/*
		 * Read a full buffer, or whatever is left before the end.
		 * Bytes that were not delivered (end of file, short read, or a
		 * position already past the end) are zeroed so that the
		 * searches below never look at uninitialised memory.
		 */
		size_t want;
		if (cur + size_buf < end)
			want = (size_t) size_buf;
		else if (end > cur)
			want = (size_t) (end - cur);
		else
			want = 0;

		size_t got = fread(buf, 1, want, *fp);
		memset(buf + got, 0, (size_t) size_buf - got);

		if (head == 0) {
			/* Hack needed for invalid object */
			pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6);

			/*
			 * Also accept " 0 obj" directly followed by '<' (0x3c)
			 * or '[' (0x5b), i.e. without separating whitespace.
			 */
			tmp = memmem(buf, size_buf, " 0 obj", 6);
			while (tmp != NULL) {
				long after = (long) (tmp - buf) + 6;

				/*
				 * The byte after the marker lies outside the
				 * buffer: it cannot be inspected here, so drop
				 * this candidate instead of reading past buf.
				 */
				if (after >= size_buf) {
					tmp = NULL;
					break;
				}

				if (tmp[6] == 0x3c || tmp[6] == 0x5b)
					break;

				tmp = memmem(tmp + 6, size_buf - after, " 0 obj", 6);
			}

			/* Whichever marker comes first in the buffer wins */
			if (pos != NULL && tmp != NULL) {
				if (pos - buf < tmp - buf)
					head = cur + (pos - buf) + 7;
				else
					head = cur + (tmp - buf) + 6;
			} else if (pos != NULL) {
				head = cur + (pos - buf) + 7;
			} else if (tmp != NULL) {
				head = cur + (tmp - buf) + 6;
			}
		}

		if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) {
			/*
			 * We need to check if it is the object stored in stream.
			 * The comparison is only made when all 11 bytes after
			 * "endobj" plus its whitespace byte are inside the buffer.
			 */
			while (size_buf - (pos - buf) - 7 >= 11 &&
			    memcmp(pos + 7,
			    "\r\nendstream", 11) == 0 &&
			    (tmp = _memmem_whitespace(pos + 7,
			    size_buf - (pos - buf) - 7,
			    "endobj", 6)) != NULL)
				pos = tmp;

			if (pos - buf < size_buf - 7)
				tail = cur + (pos - buf);
		}

		if (tail > head) {
			if (ptr->next == NULL) {
				ptr->next = malloc(sizeof(pdf_object_t));

				if (ptr->next == NULL)
					return 1;

				ptr->next->id = 0;
				ptr->next->object_size = 0;
				ptr->next->object = NULL;
				ptr->next->dictionary_size = 0;
				ptr->next->dictionary = NULL;
				ptr->next->stream_size = 0;
				ptr->next->stream = NULL;
				ptr->next->next = NULL;
				ptr = ptr->next;
			}

			ptr->address = head;
			ptr->size = tail - head;

			/* Resume right after "endobj" and its whitespace byte */
			fseek(*fp, tail + 7, SEEK_SET);
			head = tail = 0;
		} else if (head > 0 && tail > 0) {
			/*
			 * The "endobj" found precedes the object start: forget it
			 * and search again from the object start.
			 */
			if (cur + size_buf < end)
				fseek(*fp, head, SEEK_SET);
			tail = 0;
		} else {
			/*
			 * Step back 7 bytes so the next buffer overlaps the end of
			 * this one (the markers searched for span up to 7 bytes).
			 */
			fseek(*fp, -7, SEEK_CUR);
		}

		if ((cur = ftell(*fp)) + 7 >= end)
			break;
	}

	return 0;
}
/*
 * Fill str (9 bytes) with the 8 bytes stored at the given file offset,
 * followed by a NUL terminator. When the offset is negative, or the seek
 * or the read fails, str is left as all zero bytes.
 */
static void
_read_id_window(FILE *fp, long offset, char *str)
{
	memset(str, 0, 9);

	if (offset < 0 || fseek(fp, offset, SEEK_SET) != 0)
		return;

	if (fread(str, 8, 1, fp) != 1)
		memset(str, 0, 9);
}

/*
 * Locate every object in the stream with _locate(), then walk the list
 * starting at (*pdf)->next and, for each node:
 *   - read the object's bytes from the file,
 *   - skip any " 0 obj" markers still contained in them,
 *   - take the object id from the digits stored just before the marker,
 *   - if the bytes contain "<<" and ">>", store a NUL-terminated copy of
 *     the dictionary in ptr->dictionary and, when "stream\r\n" ...
 *     "endstream" follows, a copy of the bytes in between in ptr->stream;
 *   - otherwise keep the raw bytes in ptr->object.
 *
 * Returns 0 on success. Returns 1 when *pdf or *fp is NULL, size_buf is
 * less than 7, _locate() fails, or an allocation fails; nodes filled in
 * before the failure keep their data.
 *
 * NOTE(review): size_buf == 7 passes the check below, but _locate() steps
 * back 7 bytes after each 7-byte read, so it does not appear to make
 * progress on larger files - confirm whether the minimum should be 8.
 */
int
pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
{
	if (*pdf == NULL || *fp == NULL || size_buf < 7)
		return 1;

	if (_locate(pdf, fp, size_buf) != 0)
		return 1;

	pdf_object_t *ptr = (*pdf)->next;

	/* 8 bytes preceding the " 0 obj" marker plus a NUL for atoi() */
	char str[9];

	char *buf;
	char *head;
	char *tail;
	char *tmp;

	while (ptr != NULL) {
		buf = malloc(ptr->size);

		if (buf == NULL)
			return 1;

		fseek(*fp, ptr->address, SEEK_SET);
		fread(buf, ptr->size, 1, *fp);

		/* Handle incomplete object */
		head = buf;
		while ((tmp = _memmem_whitespace(head,
		    ptr->size - (head - buf),
		    " 0 obj", 6)) != NULL)
			head = tmp + 7;

		/* Hack needed for invalid object */
		while ((tmp = memmem(head,
		    ptr->size - (head - buf),
		    " 0 obj", 6)) != NULL)
			head = tmp + 6;

		if (head - buf > 0) {
			/* Drop everything up to the last marker and reload */
			ptr->address += head - buf;
			ptr->size -= head - buf;

			tmp = realloc(buf, ptr->size);

			if (tmp == NULL) {
				/*
				 * With a zero size, realloc() may already have
				 * released buf, so it is only freed here when a
				 * real (non-zero) request failed.
				 */
				if (ptr->size != 0)
					free(buf);
				return 1;
			}

			buf = tmp;

			fseek(*fp, ptr->address, SEEK_SET);
			fread(buf, ptr->size, 1, *fp);
		}

		/*
		 * Hack needed for invalid object
		 *
		 * First try the window that ends right before a 6-byte
		 * " 0 obj" marker; if its last byte is not a digit, try the
		 * window that ends one byte earlier (marker followed by a
		 * whitespace byte).
		 */
		_read_id_window(*fp, ptr->address - 14, str);

		if (str[7] < '0' || str[7] > '9')
			_read_id_window(*fp, ptr->address - 15, str);

		/* The id is the run of digits at the end of the window */
		for (int i = 7; i >= 0; i--) {
			if (str[i] < '0' || str[i] > '9') {
				if (i < 7)
					ptr->id = atoi(str + i + 1);
				else
					ptr->id = 0;
				break;
			}
		}

		if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL &&
		    ((tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL ||
		    /* Hack needed for invalid object */
		    (tail = memmem(buf, ptr->size, ">>", 2)) != NULL)) {
			if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) {
				/*
				 * "stream\r\n" appears before the first ">>"
				 * followed by whitespace: restart from the first
				 * plain ">>" and extend while no stream starts
				 * in between.
				 */
				tail = memmem(buf, ptr->size, ">>", 2);

				while (ptr->size - (tail - buf) > 2 &&
				    (tmp = memmem(tail + 2,
				    ptr->size - (tail - buf) - 2,
				    ">>", 2)) != NULL &&
				    memmem(tail + 2,
				    (tmp - tail) - 2,
				    "stream\r\n", 8) == NULL)
					tail = tmp;
			} else {
				/*
				 * A dictionary object may have nested dictionary,
				 * but it should not be in a stream
				 */
				while (ptr->size - (tail - buf) > 3 &&
				    (tmp = _memmem_whitespace(tail + 3,
				    ptr->size - (tail - buf) - 3,
				    ">>", 2)) != NULL &&
				    memmem(tail + 3,
				    (tmp - tail) - 3,
				    "stream\r\n", 8) == NULL)
					tail = tmp;
			}

			/* Copy "<<" ... ">>" inclusive and NUL-terminate it */
			ptr->dictionary_size = tail - head + 2;
			ptr->dictionary = malloc(ptr->dictionary_size + 1);

			if (ptr->dictionary == NULL) {
				free(buf);
				return 1;
			}

			memcpy(ptr->dictionary, head, ptr->dictionary_size);
			memset(ptr->dictionary + ptr->dictionary_size, 0, 1);

			if ((head = memmem(tail,
			    ptr->size - (tail - buf),
			    "stream\r\n", 8)) != NULL &&
			    (tail = _memmem_whitespace(head,
			    ptr->size - (head - buf),
			    "endstream", 9)) != NULL) {
				/*
				 * An object may contain a stream that
				 * contains another object that
				 * contains another stream
				 */
				while (_memmem_whitespace(tail + 10,
				    ptr->size - (tail - buf) - 10,
				    "endobj", 6) != NULL &&
				    (tmp = _memmem_whitespace(tail + 10,
				    ptr->size - (tail - buf) - 10,
				    "endstream", 9)) != NULL)
					tail = tmp;

				/* Bytes between "stream\r\n" and "endstream" */
				ptr->stream_size = (tail - head) - 8;
				ptr->stream = malloc(ptr->stream_size);

				if (ptr->stream == NULL) {
					free(buf);
					return 1;
				}

				memcpy(ptr->stream, head + 8, ptr->stream_size);
			}

			free(buf);
		} else {
			/* No dictionary: the node takes ownership of buf */
			ptr->object_size = ptr->size;
			ptr->object = buf;
		}

		ptr = ptr->next;
	}

	return 0;
}