aboutsummaryrefslogblamecommitdiffstats
path: root/src/pdf_parser.c
blob: e6d8ac60cdf0f84c7007b611867fa13f5c141475 (plain) (tree)
1
2
  
                                                 


















                                                                        





                     

          
                         
 




                            

                                     








                                                                               

         
                   

















                                                    

                      



                  





                                                                         
 


















                                                                                               




                                                                                                  

                                                                  



























                                                                         
                                                       
                                        
                                                  

                                                           
                                 
                        
                                                 

                 
                                                  
















                                                        
                    










                                        
































                                                           
                                      
 




                                                                






                                                                    



                                      
                                                                       


                                                                                        
























                                                                                  
 





                                                                           
                                                                            
                                                                             











                                                                      

                                                                      
                                                               

                                                                            










                                                                                

                                  
                        

                                                     

                 




                                
/*
 * Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#ifdef __linux__

#define _GNU_SOURCE

#endif /* __linux__ */

#include <stdlib.h>
#include <string.h>

#include "pdf.h"

/*
 * Find the first occurrence of the s1-byte pattern p1 inside the s0-byte
 * buffer p0 that is immediately followed by a whitespace byte
 * (0x00, 0x09, 0x0a, 0x0c, 0x0d or 0x20).
 *
 * The whitespace byte has to be inside the buffer as well, so a pattern
 * that ends exactly at the end of the buffer is not reported.
 *
 * Returns a pointer to the start of the pattern (not const-qualified,
 * like memmem), or NULL when there is no such occurrence.
 */
static void *
_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1)
{
	static const unsigned char whitespace[6] = {
		0x00,
		0x09,
		0x0a,
		0x0c,
		0x0d,
		0x20
	};

	const unsigned char *haystack = p0;

	/* The pattern and its delimiter byte must both fit in the buffer */
	if (s0 <= s1)
		return NULL;

	/*
	 * One left-to-right scan, so the earliest match always wins,
	 * including a match at offset 0
	 */
	for (size_t i = 0; i < s0 - s1; i++) {
		/* Cheap test first: is the byte after the candidate a delimiter? */
		if (memchr(whitespace, haystack[i + s1], sizeof(whitespace)) == NULL)
			continue;

		if (s1 == 0 || memcmp(haystack + i, p1, s1) == 0)
			return (void *) (haystack + i);
	}

	return NULL;
}

/*
 * Scan the file from its current position to its end and append one node
 * to the object list for every object found.  Only the location of the
 * object body is recorded (address and size); every other field of a new
 * node is zeroed.
 *
 * The file is examined through a window of size_buf bytes.  When nothing
 * conclusive is found the next window overlaps the previous one by
 * 7 bytes, so that a 6-byte marker plus the byte following it is never
 * split between two windows.
 *
 * pdf       list head; new nodes are appended after its last node
 * fp        stream to scan; its position is changed
 * size_buf  window size in bytes, must be greater than 7
 *
 * Returns 0 on success, 1 on invalid window size, allocation failure or
 * I/O error.  Nodes appended before a failure stay in the list.
 */
static int
_locate(pdf_object_t **pdf, FILE **fp, int size_buf)
{
	/*
	 * A pass may rewind 7 bytes, so a window of 7 bytes or less
	 * would never move forward
	 */
	if (size_buf <= 7)
		return 1;

	pdf_object_t *ptr = *pdf;
	while (ptr->next != NULL)
		ptr = ptr->next;

	long cur = ftell(*fp);
	if (cur < 0 || fseek(*fp, 0, SEEK_END) != 0)
		return 1;

	long end = ftell(*fp);
	if (end < 0 || fseek(*fp, cur, SEEK_SET) != 0)
		return 1;

	/* Nothing left to scan; also keeps the padding below inside the window */
	if (cur >= end)
		return 0;

	/* Heap, not stack: the window size is chosen by the caller */
	char *buf = malloc(size_buf);
	if (buf == NULL)
		return 1;

	int ret = 0;
	long head = 0;
	long tail = 0;
	char *pos;
	char *tmp;

	for (;;) {
		long want = cur + size_buf < end ? size_buf : end - cur;

		if (fread(buf, 1, want, *fp) != (size_t) want) {
			ret = 1;
			break;
		}

		/* Zero the unused part of a short final window */
		memset(buf + want, 0, size_buf - want);

		if (head == 0) {
			/* Hack needed for invalid object */
			pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6);
			tmp = memmem(buf, size_buf, " 0 obj", 6);

			/*
			 * A marker without trailing whitespace only counts
			 * when it is directly followed by '<' or '['
			 */
			while (tmp != NULL) {
				long next = (tmp - buf) + 6;

				if (next >= size_buf) {
					/* The following byte is outside the window */
					tmp = NULL;
					break;
				}

				if (buf[next] == 0x3c || buf[next] == 0x5b)
					break;

				tmp = memmem(buf + next, size_buf - next, " 0 obj", 6);
			}

			/* Whichever kind of marker comes first starts the object */
			if (pos != NULL && (tmp == NULL || pos - buf < tmp - buf))
				head = cur + (pos - buf) + 7;
			else if (tmp != NULL)
				head = cur + (tmp - buf) + 6;
		}

		if (tail == 0 &&
			(pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) {
			/* We need to check if it is the object stored in stream */
			for (;;) {
				long rest = size_buf - (pos - buf) - 7;

				/* Compare only when all 11 bytes are inside the window */
				if (rest < 11 ||
					memcmp(pos + 7, "\r\nendstream", 11) != 0)
					break;

				tmp = _memmem_whitespace(pos + 7, rest, "endobj", 6);

				if (tmp == NULL)
					break;

				pos = tmp;
			}

			/* A hit in the last 7 bytes is left for the next window */
			if (pos - buf < size_buf - 7)
				tail = cur + (pos - buf);
		}

		if (tail > head) {
			if (ptr->next == NULL) {
				pdf_object_t *node = malloc(sizeof(*node));

				if (node == NULL) {
					ret = 1;
					break;
				}

				node->id = 0;
				node->object_size = 0;
				node->object = NULL;
				node->dictionary_size = 0;
				node->dictionary = NULL;
				node->stream_size = 0;
				node->stream = NULL;
				node->next = NULL;

				ptr->next = node;
				ptr = node;
			}

			ptr->address = head;
			ptr->size = tail - head;

			/* Continue right after "endobj" and its delimiter */
			fseek(*fp, tail + 7, SEEK_SET);
			head = tail = 0;
		} else if (head > 0 && tail > 0) {
			/* The end marker precedes the start: search again from the start */
			if (cur + size_buf < end)
				fseek(*fp, head, SEEK_SET);
			tail = 0;
		} else {
			/* Overlap the next window with the end of this one */
			fseek(*fp, -7, SEEK_CUR);
		}

		cur = ftell(*fp);

		if (cur < 0) {
			ret = 1;
			break;
		}

		if (cur + 7 >= end)
			break;
	}

	free(buf);

	return ret;
}

/*
 * Load the content of one located object from fp.
 *
 * The bytes at ptr->address are read, any leading " 0 obj" markers are
 * trimmed off (address and size are adjusted accordingly), the object id
 * is parsed from the digits in front of the marker, and the content is
 * stored either as dictionary (plus stream, if present) or, when no
 * "<<" ... ">>" pair is found, as raw object.
 *
 * Returns 0 on success, 1 on allocation failure or I/O error.  Fields
 * already filled in before a failure are left in place; the temporary
 * read buffer is always released.
 */
static int
_load_object(pdf_object_t *ptr, FILE *fp)
{
	/* 8 bytes read from the file plus a terminator for atoi() */
	char str[9] = { 0 };
	char *head;
	char *tail;
	char *tmp;

	char *buf = malloc(ptr->size);

	if (buf == NULL)
		return 1;

	if (fseek(fp, ptr->address, SEEK_SET) != 0 ||
		fread(buf, 1, ptr->size, fp) != (size_t) ptr->size)
		goto fail;

	/* Handle incomplete object */
	head = buf;
	while ((tmp = _memmem_whitespace(head,
		ptr->size - (head - buf),
		" 0 obj", 6)) != NULL)
		head = tmp + 7;

	/* Hack needed for invalid object */
	while ((tmp = memmem(head,
		ptr->size - (head - buf),
		" 0 obj", 6)) != NULL)
		head = tmp + 6;

	if (head - buf > 0) {
		ptr->address += head - buf;
		ptr->size -= head - buf;

		/* The wanted bytes are already in memory, just move them up */
		memmove(buf, head, ptr->size);

		/*
		 * Shrinking is optional, and realloc() with a size of
		 * zero is avoided since its result is not portable
		 */
		if (ptr->size > 0 && (tmp = realloc(buf, ptr->size)) != NULL)
			buf = tmp;
	}

	/*
	 * Hack needed for invalid object
	 *
	 * The id is taken from the 8 bytes ending right before the
	 * marker; if they cannot be read the id becomes 0
	 */
	if (ptr->address < 14 ||
		fseek(fp, ptr->address - 14, SEEK_SET) != 0 ||
		fread(str, 8, 1, fp) != 1)
		memset(str, 0, 8);

	if (str[7] < '0' || str[7] > '9') {
		/* The marker was followed by whitespace, look one byte earlier */
		if (ptr->address < 15 ||
			fseek(fp, ptr->address - 15, SEEK_SET) != 0 ||
			fread(str, 8, 1, fp) != 1)
			memset(str, 0, 8);
	}

	/* Walk back over the trailing digits; the id is left as is if all 8 are digits */
	for (int i = 7; i >= 0; i--) {
		if (str[i] < '0' || str[i] > '9') {
			if (i < 7)
				ptr->id = atoi(str + i + 1);
			else
				ptr->id = 0;

			break;
		}
	}

	head = memmem(buf, ptr->size, "<<", 2);
	tail = NULL;

	if (head != NULL &&
		(tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) == NULL)
		/* Hack needed for invalid object */
		tail = memmem(buf, ptr->size, ">>", 2);

	if (head == NULL || tail == NULL) {
		/* No dictionary: the list node takes ownership of the raw bytes */
		ptr->object_size = ptr->size;
		ptr->object = buf;

		return 0;
	}

	if (memmem(buf, tail - buf, "stream\r\n", 8) != NULL) {
		/*
		 * The delimited ">>" lies behind the start of a stream,
		 * fall back to ">>" without the whitespace requirement
		 */
		tail = memmem(buf, ptr->size, ">>", 2);

		while (ptr->size - (tail - buf) > 2 &&
			(tmp = memmem(tail + 2,
			ptr->size - (tail - buf) - 2,
			">>", 2)) != NULL &&
			memmem(tail + 2,
			(tmp - tail) - 2,
			"stream\r\n", 8) == NULL)
			tail = tmp;
	} else {
		/*
		 * A dictionary object may have nested dictionary,
		 * but it should not be in a stream
		 */
		while (ptr->size - (tail - buf) > 3 &&
			(tmp = _memmem_whitespace(tail + 3,
			ptr->size - (tail - buf) - 3,
			">>", 2)) != NULL &&
			memmem(tail + 3,
			(tmp - tail) - 3,
			"stream\r\n", 8) == NULL)
			tail = tmp;
	}

	/* ">>" in front of "<<" would give a negative length */
	if (tail - head + 2 < 0)
		goto fail;

	ptr->dictionary_size = tail - head + 2;
	ptr->dictionary = malloc(ptr->dictionary_size + 1);

	if (ptr->dictionary == NULL)
		goto fail;

	memcpy(ptr->dictionary, head, ptr->dictionary_size);
	memset(ptr->dictionary + ptr->dictionary_size, 0, 1);

	if ((head = memmem(tail,
		ptr->size - (tail - buf),
		"stream\r\n", 8)) != NULL &&
		(tail = _memmem_whitespace(head,
		ptr->size - (head - buf),
		"endstream", 9)) != NULL) {
		/*
		 * An object may contain a stream that
		 * contains another object that
		 * contains another stream
		 */
		while (_memmem_whitespace(tail + 10,
			ptr->size - (tail - buf) - 10,
			"endobj", 6) != NULL &&
			(tmp = _memmem_whitespace(tail + 10,
			ptr->size - (tail - buf) - 10,
			"endstream", 9)) != NULL)
			tail = tmp;

		/* The stream data starts after the 8 bytes of "stream\r\n" */
		ptr->stream_size = (tail - head) - 8;
		ptr->stream = malloc(ptr->stream_size);

		if (ptr->stream == NULL)
			goto fail;

		memcpy(ptr->stream, head + 8, ptr->stream_size);
	}

	free(buf);

	return 0;

fail:
	free(buf);

	return 1;
}

/*
 * Locate every object in the file, starting at the current position of
 * *fp, append them to the list *pdf and load their content.
 *
 * pdf       address of the list head; the head itself must exist
 * fp        address of the open stream to read from
 * size_buf  size in bytes of the window used while locating objects,
 *           must be greater than 7
 *
 * Returns 0 on success, 1 on invalid argument, allocation failure or
 * I/O error.  On failure the list may be partially filled.
 */
int
pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
{
	/* A window of exactly 7 bytes can never advance through the file */
	if (pdf == NULL || fp == NULL ||
		*pdf == NULL || *fp == NULL || size_buf <= 7)
		return 1;

	if (_locate(pdf, fp, size_buf) != 0)
		return 1;

	for (pdf_object_t *ptr = (*pdf)->next; ptr != NULL; ptr = ptr->next) {
		if (_load_object(ptr, *fp) != 0)
			return 1;
	}

	return 0;
}