aboutsummaryrefslogtreecommitdiffstats
path: root/src/cnki_pdf.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/cnki_pdf.c')
-rw-r--r-- src/cnki_pdf.c 632
1 files changed, 624 insertions, 8 deletions
diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c
index 8c9fbb8..bb76a04 100644
--- a/src/cnki_pdf.c
+++ b/src/cnki_pdf.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, yzrh <yzrh@noema.org>
+ * Copyright (c) 2020-2021, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@@ -8,6 +8,9 @@
#include <string.h>
#include "cnki.h"
+#include "iconv.h"
+#include "zlib.h"
+#include "jpeg.h"
#include "pdf.h"
#include "pdf_cnki.h"
@@ -57,6 +60,11 @@ cnki_pdf(cnki_t **param)
printf("Loaded %d object(s)\n",
pdf_get_count(&pdf));
+ int dictionary_size;
+ char *dictionary;
+
+ char buf[64];
+
if ((*param)->stat > 1)
printf("Searching for parent object(s)\n");
@@ -69,12 +77,8 @@ cnki_pdf(cnki_t **param)
if ((*param)->stat > 0)
printf("Discovered %d parent object(s)\n", parent[0]);
- char buf[64];
-
int parent_missing[parent[0]];
int *kid;
- int dictionary_size;
- char *dictionary;
for (int i = 1; i <= parent[0]; i++) {
if ((*param)->stat > 1)
@@ -101,20 +105,23 @@ cnki_pdf(cnki_t **param)
snprintf(buf, 64,
"<<\n/Type /Pages\n/Kids [");
strcat(dictionary, buf);
+
for (int j = 1; j <= kid[0]; j++) {
snprintf(buf, 64,
"%d 0 R",
kid[j]);
strcat(dictionary, buf);
+
if (j < kid[0])
strcat(dictionary, " ");
}
+
snprintf(buf, 64,
"]\n/Count %d\n>>",
pdf_get_kid_count(&pdf, parent[i]));
strcat(dictionary, buf);
- pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL);
+ pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0);
parent_missing[i - 1] = 1;
@@ -185,6 +192,7 @@ cnki_pdf(cnki_t **param)
if (parent_missing[i]) {
snprintf(buf, 64, "%d 0 R", parent[i + 1]);
strcat(dictionary, buf);
+
if (i < root_kid)
strcat(dictionary, " ");
}
@@ -200,7 +208,7 @@ cnki_pdf(cnki_t **param)
strcat(dictionary, ">>");
- pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL);
+ pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL, 0);
memset(dictionary, 0, dictionary_size);
@@ -260,7 +268,7 @@ cnki_pdf(cnki_t **param)
strcat(dictionary, ">>");
- pdf_obj_append(&pdf, 0, NULL, dictionary, NULL);
+ pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0);
if ((*param)->stat > 0)
printf("Generated catalog object\n");
@@ -383,3 +391,611 @@ cnki_pdf(cnki_t **param)
return 0;
}
+
+/*
+ * Build the text layer of one HN page as a PDF hex string:
+ * "<feff", the characters as UTF-16BE hex digits, then ">".
+ *
+ * When "COMPRESSTEXT" is found at offset 8 the text is unpacked with
+ * cnki_zlib() first and read as 16-byte records (character bytes at
+ * offsets 7 and 6); otherwise it is read as 4-byte records (character
+ * bytes at offsets 3 and 2). Each character is converted from GB18030.
+ *
+ * On return *dst is either NULL (no usable text) or a string the
+ * caller must free. Returns 0 on success, 1 when out of memory.
+ */
+static int
+cnki_pdf_hn_text(char **dst, object_hn_t *page)
+{
+	char *inflated = NULL;
+	int inflated_size = 0;
+
+	const char *src;
+	int src_size;
+	int stride;
+	int last;
+
+	*dst = NULL;
+
+	if (page->text == NULL || page->text_size <= 0)
+		return 0;
+
+	/* The marker occupies bytes 8..19, so shorter text cannot carry it */
+	if (page->text_size >= 20 &&
+		strncmp(page->text + 8, "COMPRESSTEXT", 12) == 0) {
+		/*
+		 * NOTE(review): assumes cnki_zlib() returns 0 on success and
+		 * owns the output buffer on failure, like the other helpers
+		 * called in this file -- confirm.
+		 */
+		if (cnki_zlib(&inflated, &inflated_size,
+			page->text, page->text_size) != 0)
+			return 0;
+
+		if (inflated == NULL || inflated_size <= 0) {
+			free(inflated);
+			return 0;
+		}
+
+		src = inflated;
+		src_size = inflated_size;
+		stride = 16;
+		last = 7;
+	} else {
+		src = page->text;
+		src_size = page->text_size;
+		stride = 4;
+		last = 3;
+	}
+
+	/* At most 8 hex digits per record, plus "<feff", ">" and NUL */
+	size_t out_size = ((size_t) src_size / stride + 1) * 8 + 8;
+	char *out = calloc(1, out_size);
+
+	if (out == NULL) {
+		free(inflated);
+		return 1;
+	}
+
+	size_t pos = 5;
+	memcpy(out, "<feff", 5);
+
+	/* Stop before a truncated trailing record instead of reading past the end */
+	for (int i = 0; i < src_size - last; i += stride) {
+		char conv_src[2];
+		char *conv_dst;
+		int conv_size = 6;
+
+		conv_src[0] = src[i + last];
+		conv_src[1] = src[i + last - 1];
+
+		if (strconv(&conv_dst, "UTF-16BE",
+			conv_src, "GB18030", &conv_size) != 0)
+			continue;
+
+		/*
+		 * conv_size was 6 going in; clamp the payload to 4 bytes so
+		 * an unexpected value can never overrun the output buffer
+		 */
+		int payload = conv_size - 2;
+
+		if (payload > 4)
+			payload = 4;
+
+		for (int j = 0; j < payload; j++) {
+			snprintf(out + pos, 3,
+				"%02x", (unsigned char) conv_dst[j]);
+			pos += 2;
+		}
+
+		free(conv_dst);
+	}
+
+	out[pos++] = '>';
+	out[pos] = '\0';
+
+	free(inflated);
+
+	*dst = out;
+
+	return 0;
+}
+
+/*
+ * Generate a PDF from the HN page list in (*param)->object_hn and
+ * write it to (*param)->fp_o.
+ *
+ * For every page this creates one image XObject per decodable image,
+ * a resource dictionary, a deflated content stream that draws the
+ * images, and a page object. Afterwards the optional outline, the
+ * page tree root and the catalog are added, the objects are sorted
+ * and header, objects, cross-reference table and trailer are dumped.
+ *
+ * (*param)->stat controls how much progress output is printed.
+ *
+ * Returns 0 on success and 1 on failure; every buffer allocated here
+ * and the PDF object list are released on both paths.
+ */
+int
+cnki_pdf_hn(cnki_t **param)
+{
+	if (*param == NULL)
+		return 1;
+
+	pdf_object_t *pdf = NULL;
+
+	if (pdf_obj_create(&pdf) != 0)
+		return 1;
+
+	if ((*param)->stat > 1)
+		printf("Generating PDF object(s)\n");
+
+	size_t dictionary_size;
+	char *dictionary = NULL;
+
+	int stream_size = 0;
+	char *stream = NULL;
+
+	char buf[64];
+
+	int *ids = NULL;
+	int *dim = NULL;
+
+	int root = 0;
+	pdf_object_t *tmp = NULL;
+
+	long cur = 0;
+	long xref;
+
+	int cnt = 0;
+
+	/* calloc zeroes every slot, not just the first `page` bytes */
+	int *root_kid = calloc((*param)->file_stat->page, sizeof *root_kid);
+
+	if (root_kid == NULL)
+		goto fail;
+
+	for (object_hn_t *ptr = (*param)->object_hn; ptr != NULL; ptr = ptr->next) {
+		/* More pages than announced would overrun root_kid */
+		if (ptr->image_length < 0 || cnt >= (*param)->file_stat->page)
+			goto fail;
+
+		/*
+		 * External object (ptr->image_length) +
+		 * resource object +
+		 * content object +
+		 * page object
+		 */
+		pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3);
+
+		if (ids == NULL)
+			goto fail;
+
+		/* Width/height pairs; the spare slot avoids a zero-byte request */
+		dim = calloc(2 * (size_t) ptr->image_length + 1, sizeof *dim);
+
+		if (dim == NULL)
+			goto fail;
+
+		for (int i = 0; i < ptr->image_length; i++) {
+			int ret;
+			int wh[2];
+
+			/*
+			 * Worst case (11-character integers) is 163 bytes;
+			 * typical values already exceed 128
+			 */
+			dictionary_size = 256;
+			dictionary = calloc(1, dictionary_size);
+
+			if (dictionary == NULL)
+				goto fail;
+
+			strcat(dictionary, "<<\n/Type /XObject\n"
+				"/Subtype /Image\n");
+
+			if ((*param)->stat > 2)
+				printf("\tDecoding data, page %04d item %02d... ",
+					ptr->page, i);
+
+			switch (ptr->image_data[i].format) {
+			case JBIG:
+				ret = cnki_jbig(&stream,
+					&stream_size,
+					&wh[0],
+					&wh[1],
+					ptr->image_data[i].image,
+					ptr->image_data[i].size);
+
+				if (ret != 0) {
+					/* Not ours to free after a failed decode */
+					stream = NULL;
+
+					dim[i * 2] = 0;
+					dim[i * 2 + 1] = 0;
+					break;
+				}
+
+				snprintf(buf, 64, "/Width %d\n/Height %d\n",
+					wh[0], wh[1]);
+				strcat(dictionary, buf);
+
+				strcat(dictionary, "/ColorSpace /DeviceGray\n"
+					"/BitsPerComponent 1\n");
+
+				snprintf(buf, 64, "/Length %d\n",
+					stream_size);
+				strcat(dictionary, buf);
+
+				strcat(dictionary, "/Filter /CCITTFaxDecode\n");
+
+				dim[i * 2] = wh[0];
+				dim[i * 2 + 1] = wh[1];
+				break;
+			case DCT_0:
+			case DCT_1:
+				ret = strinfo_jpeg_dim(&wh[0],
+					&wh[1],
+					ptr->image_data[i].image,
+					ptr->image_data[i].size);
+
+				if (ret != 0) {
+					dim[i * 2] = 0;
+					dim[i * 2 + 1] = 0;
+					break;
+				}
+
+				/* JPEG data is embedded unchanged */
+				stream_size = ptr->image_data[i].size;
+				stream = malloc(stream_size);
+
+				if (stream == NULL)
+					goto fail;
+
+				memcpy(stream, ptr->image_data[i].image, stream_size);
+
+				snprintf(buf, 64, "/Width %d\n/Height %d\n",
+					wh[0], wh[1]);
+				strcat(dictionary, buf);
+
+				strcat(dictionary, "/ColorSpace /DeviceRGB\n"
+					"/BitsPerComponent 8\n");
+
+				snprintf(buf, 64, "/Length %d\n",
+					stream_size);
+				strcat(dictionary, buf);
+
+				strcat(dictionary, "/Filter /DCTDecode\n");
+
+				dim[i * 2] = wh[0];
+				dim[i * 2 + 1] = wh[1];
+				break;
+			case JBIG2:
+			case JPX:
+			default:
+				/* Unsupported: no object is generated for this image */
+				ret = -1;
+				dim[i * 2] = -1;
+				dim[i * 2 + 1] = -1;
+				break;
+			}
+
+			strcat(dictionary, ">>");
+
+			if (ret == 0) {
+				if ((*param)->stat > 2)
+					printf("Succeed\n");
+
+				pdf_obj_append(&pdf, ids[i],
+					NULL, dictionary, stream, stream_size);
+
+				free(stream);
+				stream = NULL;
+			} else if (ret == 1) {
+				if ((*param)->stat > 2)
+					printf("; Failed\n");
+
+				/* Keep the id occupied with an empty object */
+				pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0);
+			}
+
+			free(dictionary);
+			dictionary = NULL;
+		}
+
+		/* Resource object: up to 31 bytes per "/ImN id 0 R " entry */
+		dictionary_size = 32 + 32 * (size_t) ptr->image_length;
+		dictionary = calloc(1, dictionary_size);
+
+		if (dictionary == NULL)
+			goto fail;
+
+		strcat(dictionary, "<<\n/XObject <<");
+
+		for (int i = 0; i < ptr->image_length; i++) {
+			snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]);
+			strcat(dictionary, buf);
+
+			if (i + 1 < ptr->image_length)
+				strcat(dictionary, " ");
+		}
+
+		strcat(dictionary, ">>\n>>");
+
+		pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0);
+
+		free(dictionary);
+		dictionary = NULL;
+
+		char *text = NULL;
+
+		if (cnki_pdf_hn_text(&text, ptr) != 0)
+			goto fail;
+
+		/* FIXME: Use the text somehow? */
+		free(text);
+
+		/*
+		 * Content object: up to 70 bytes of operators per image. The
+		 * same buffer is reused for the stream dictionary and for the
+		 * page dictionary (up to 102 bytes), hence the 128 byte base.
+		 */
+		dictionary_size = 128 + 80 * (size_t) ptr->image_length;
+		dictionary = calloc(1, dictionary_size);
+
+		if (dictionary == NULL)
+			goto fail;
+
+		strcat(dictionary, "q\n");
+
+		strcat(dictionary, "0.120000 0 0 0.120000 0 0 cm\n");
+
+		/*
+		 * NOTE(review): the matrices below accumulate because nothing
+		 * is saved or restored between images -- confirm this is what
+		 * pages with several images need.
+		 */
+		for (int i = 0; i < ptr->image_length; i++) {
+			/* Skip images that failed to decode or are unsupported */
+			if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0)
+				continue;
+
+			/* Apply transformation matrix */
+			if (ptr->image_data[i].format == DCT_1)
+				strcat(dictionary, "-1 0 0 -1 0 0 cm\n");
+
+			snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n",
+				dim[i * 2], dim[i * 2 + 1]);
+			strcat(dictionary, buf);
+
+			snprintf(buf, 64, "/Im%d Do\n", i);
+			strcat(dictionary, buf);
+		}
+
+		strcat(dictionary, "Q");
+
+		if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) {
+			/* Not ours to free after a failed deflate */
+			stream = NULL;
+			goto fail;
+		}
+
+		memset(dictionary, 0, dictionary_size);
+
+		strcat(dictionary, "<<\n");
+
+		snprintf(buf, 64, "/Length %d\n", stream_size);
+		strcat(dictionary, buf);
+
+		strcat(dictionary, "/Filter /FlateDecode\n");
+
+		strcat(dictionary, ">>");
+
+		pdf_obj_append(&pdf, ids[ptr->image_length + 1],
+			NULL, dictionary, stream, stream_size);
+
+		free(stream);
+		stream = NULL;
+
+		memset(dictionary, 0, dictionary_size);
+
+		strcat(dictionary, "<<\n/Type /Page\n");
+
+		snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]);
+		strcat(dictionary, buf);
+
+		snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]);
+		strcat(dictionary, buf);
+
+		/* A4 paper */
+		strcat(dictionary, "/MediaBox [ 0 0 595.276 841.89 ]\n");
+
+		/* Left open on purpose: /Parent and ">>" follow once root is known */
+		pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0);
+
+		free(dictionary);
+		dictionary = NULL;
+
+		root_kid[cnt++] = ids[ptr->image_length + 2];
+
+		free(ids);
+		ids = NULL;
+
+		free(dim);
+		dim = NULL;
+	}
+
+	if ((*param)->stat > 1) {
+		printf("\t%8s\t%12s\t%12s\t%12s\n",
+			"id",
+			"object",
+			"dictionary",
+			"stream");
+
+		for (pdf_object_t *obj = pdf->next; obj != NULL; obj = obj->next)
+			printf("\t%8d\t%12d\t%12d\t%12d\n",
+				obj->id,
+				obj->object_size,
+				obj->dictionary_size,
+				obj->stream_size);
+	}
+
+	if ((*param)->stat > 0)
+		printf("Generated %d object(s)\n",
+			pdf_get_count(&pdf));
+
+	if ((*param)->file_stat->outline > 0) {
+		if ((*param)->stat > 1)
+			printf("Generating outline object(s)\n\t%8s\n", "id");
+
+		pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1);
+
+		if (ids == NULL)
+			goto fail;
+
+		int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids);
+
+		if ((*param)->stat > 1)
+			for (int i = 0; i < (*param)->file_stat->outline + 1; i++)
+				printf("\t%8d\n", ids[i]);
+
+		if ((*param)->stat > 0) {
+			if (outline != 0)
+				printf("No outline information\n");
+			else
+				printf("Generated %d outline object(s)\n",
+					(*param)->file_stat->outline + 1);
+		}
+	}
+
+	if ((*param)->stat > 1)
+		printf("Generating root object\n");
+
+	/* Up to 16 bytes per "id 0 R " entry */
+	dictionary_size = 64 + 16 * (size_t) (*param)->file_stat->page;
+	dictionary = calloc(1, dictionary_size);
+
+	if (dictionary == NULL)
+		goto fail;
+
+	root = pdf_get_free_id(&pdf);
+
+	/* /Kids has to be an array, even when there is a single page */
+	snprintf(buf, 64, "<<\n/Type /Pages\n/Kids [");
+	strcat(dictionary, buf);
+
+	for (int i = 0; i < (*param)->file_stat->page; i++) {
+		snprintf(buf, 64, "%d 0 R", root_kid[i]);
+		strcat(dictionary, buf);
+
+		if (i + 1 < (*param)->file_stat->page)
+			strcat(dictionary, " ");
+	}
+
+	strcat(dictionary, "]\n");
+
+	snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page);
+	strcat(dictionary, buf);
+
+	strcat(dictionary, ">>");
+
+	pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL, 0);
+
+	free(dictionary);
+	dictionary = NULL;
+
+	/* Add /Parent to page object */
+	for (int i = 0; i < (*param)->file_stat->page; i++) {
+		if (pdf_get_obj(&pdf, root_kid[i], &tmp) != 0)
+			goto fail;
+
+		if (tmp == NULL || tmp->dictionary == NULL)
+			goto fail;
+
+		/* Sized from the stored dictionary; 32 bytes cover the suffix */
+		dictionary_size = strlen(tmp->dictionary) + 32;
+		dictionary = calloc(1, dictionary_size);
+
+		if (dictionary == NULL)
+			goto fail;
+
+		strcat(dictionary, tmp->dictionary);
+
+		snprintf(buf, 64, "/Parent %d 0 R\n>>", root);
+		strcat(dictionary, buf);
+
+		if (pdf_obj_replace(&pdf, root_kid[i], NULL, dictionary, NULL, 0) != 0)
+			goto fail;
+
+		free(dictionary);
+		dictionary = NULL;
+	}
+
+	free(root_kid);
+	root_kid = NULL;
+
+	if ((*param)->stat > 0)
+		printf("Generated root object %d.\n",
+			root);
+
+	if ((*param)->stat > 1)
+		printf("Generating catalog object\n");
+
+	dictionary_size = 128;
+	dictionary = calloc(1, dictionary_size);
+
+	if (dictionary == NULL)
+		goto fail;
+
+	snprintf(buf, 64,
+		"<<\n/Type /Catalog\n/Pages %d 0 R\n",
+		root);
+	strcat(dictionary, buf);
+
+	/* ids is only set when outline objects were requested above */
+	if (ids != NULL) {
+		snprintf(buf, 64,
+			"/Outlines %d 0 R\n/PageMode /UseOutlines\n",
+			ids[0]);
+		strcat(dictionary, buf);
+	}
+
+	strcat(dictionary, ">>");
+
+	pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0);
+
+	free(dictionary);
+	dictionary = NULL;
+
+	free(ids);
+	ids = NULL;
+
+	if ((*param)->stat > 0)
+		printf("Generated catalog object\n");
+
+	if ((*param)->stat > 1)
+		printf("Sorting object(s)\n");
+
+	pdf_obj_sort(&pdf);
+
+	if ((*param)->stat > 0)
+		printf("Sorted object(s)\n");
+
+	if ((*param)->stat > 1)
+		printf("Writing header\n");
+
+	if ((*param)->stat > 0)
+		cur = ftell((*param)->fp_o);
+
+	if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) {
+		fprintf(stderr, "Header not written\n");
+		goto fail;
+	} else {
+		if ((*param)->stat > 0)
+			printf("Header %ld byte(s) written\n",
+				ftell((*param)->fp_o) - cur);
+	}
+
+	if ((*param)->stat > 1)
+		printf("Writing object(s)\n");
+
+	pdf_dump_obj(&pdf, &(*param)->fp_o);
+
+	if ((*param)->stat > 1) {
+		printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n",
+			"address",
+			"size",
+			"id",
+			"object",
+			"dictionary",
+			"stream");
+
+		for (pdf_object_t *obj = pdf->next; obj != NULL; obj = obj->next)
+			printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n",
+				obj->address,
+				obj->size,
+				obj->id,
+				obj->object_size,
+				obj->dictionary_size,
+				obj->stream_size);
+	}
+
+	if ((*param)->stat > 0)
+		printf("%d object(s) %ld byte(s) written\n",
+			pdf_get_count(&pdf),
+			ftell((*param)->fp_o));
+
+	/* The trailer needs the offset at which the table starts */
+	xref = ftell((*param)->fp_o);
+
+	if ((*param)->stat > 1)
+		printf("Writing cross-reference table\n");
+
+	if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) {
+		if ((*param)->stat > 0)
+			printf("Cross-reference table not written\n");
+	} else {
+		if ((*param)->stat > 0)
+			printf("Cross-reference table %ld byte(s) written\n",
+				ftell((*param)->fp_o) - xref);
+	}
+
+	if ((*param)->stat > 1)
+		printf("Writing trailer\n");
+
+	if ((*param)->stat > 0)
+		cur = ftell((*param)->fp_o);
+
+	if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) {
+		if ((*param)->stat > 0)
+			printf("Trailer not written\n");
+	} else {
+		if ((*param)->stat > 0)
+			printf("Trailer %ld byte(s) written\n",
+				ftell((*param)->fp_o) - cur);
+	}
+
+	if ((*param)->stat > 0)
+		printf("Total %ld byte(s) written\n",
+			ftell((*param)->fp_o));
+
+	pdf_obj_destroy(&pdf);
+
+	return 0;
+
+fail:
+	/* Pointers are reset to NULL once released, so this never frees twice */
+	free(stream);
+	free(dictionary);
+	free(dim);
+	free(ids);
+	free(root_kid);
+
+	pdf_obj_destroy(&pdf);
+
+	return 1;
+}