diff options
author | yzrh <yzrh@noema.org> | 2020-12-31 22:36:28 +0000 |
---|---|---|
committer | yzrh <yzrh@noema.org> | 2021-01-03 03:01:28 +0000 |
commit | 1994f122cc29504862944cca1da1c5203c7e41eb (patch) | |
tree | d89e37c5f3443156116bd8476e0efc2d380acd55 /src/cnki_pdf.c | |
parent | b20c6ad3ed930977990f3812b25b80d2ce282d79 (diff) | |
download | melon-1994f122cc29504862944cca1da1c5203c7e41eb.tar.gz melon-1994f122cc29504862944cca1da1c5203c7e41eb.tar.zst |
Decode JBIG and JPEG during HN conversion.
Diffstat (limited to 'src/cnki_pdf.c')
-rw-r--r-- | src/cnki_pdf.c | 632 |
1 files changed, 624 insertions, 8 deletions
diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index 8c9fbb8..bb76a04 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, yzrh <yzrh@noema.org> + * Copyright (c) 2020-2021, yzrh <yzrh@noema.org> * * SPDX-License-Identifier: Apache-2.0 */ @@ -8,6 +8,9 @@ #include <string.h> #include "cnki.h" +#include "iconv.h" +#include "zlib.h" +#include "jpeg.h" #include "pdf.h" #include "pdf_cnki.h" @@ -57,6 +60,11 @@ cnki_pdf(cnki_t **param) printf("Loaded %d object(s)\n", pdf_get_count(&pdf)); + int dictionary_size; + char *dictionary; + + char buf[64]; + if ((*param)->stat > 1) printf("Searching for parent object(s)\n"); @@ -69,12 +77,8 @@ cnki_pdf(cnki_t **param) if ((*param)->stat > 0) printf("Discovered %d parent object(s)\n", parent[0]); - char buf[64]; - int parent_missing[parent[0]]; int *kid; - int dictionary_size; - char *dictionary; for (int i = 1; i <= parent[0]; i++) { if ((*param)->stat > 1) @@ -101,20 +105,23 @@ cnki_pdf(cnki_t **param) snprintf(buf, 64, "<<\n/Type /Pages\n/Kids ["); strcat(dictionary, buf); + for (int j = 1; j <= kid[0]; j++) { snprintf(buf, 64, "%d 0 R", kid[j]); strcat(dictionary, buf); + if (j < kid[0]) strcat(dictionary, " "); } + snprintf(buf, 64, "]\n/Count %d\n>>", pdf_get_kid_count(&pdf, parent[i])); strcat(dictionary, buf); - pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL); + pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0); parent_missing[i - 1] = 1; @@ -185,6 +192,7 @@ cnki_pdf(cnki_t **param) if (parent_missing[i]) { snprintf(buf, 64, "%d 0 R", parent[i + 1]); strcat(dictionary, buf); + if (i < root_kid) strcat(dictionary, " "); } @@ -200,7 +208,7 @@ cnki_pdf(cnki_t **param) strcat(dictionary, ">>"); - pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL); + pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL, 0); memset(dictionary, 0, dictionary_size); @@ -260,7 +268,7 @@ cnki_pdf(cnki_t **param) strcat(dictionary, ">>"); - pdf_obj_append(&pdf, 0, NULL, dictionary, NULL); + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); if ((*param)->stat > 0) printf("Generated catalog object\n"); @@ -383,3 +391,611 @@ cnki_pdf(cnki_t **param) return 0; } + +int +cnki_pdf_hn(cnki_t **param) +{ + if (*param == NULL) + return 1; + + pdf_object_t *pdf = NULL; + + if (pdf_obj_create(&pdf) != 0) + return 1; + + if ((*param)->stat > 1) + printf("Generating PDF object(s)\n"); + + int dictionary_size; + char *dictionary; + + char buf[64]; + + int *ids = NULL; + + int cnt = 0; + int *root_kid = malloc((*param)->file_stat->page * sizeof(int)); + + if (root_kid == NULL) + return 1; + + memset(root_kid, 0, (*param)->file_stat->page); + + object_hn_t *ptr = (*param)->object_hn; + while (ptr != NULL) { + /* + * External object (ptr->image_length) + + * content object + + * resource object + + * page object + */ + pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); + + int stream_size; + char *stream; + + int *dim = malloc(2 * ptr->image_length * sizeof(int)); + + int ret; + int wh[2]; + + if (dim == NULL) { + free(root_kid); + return 1; + } + + for (int i = 0; i < ptr->image_length; i++) { + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n/Type /XObject\n" + "/Subtype /Image\n"); + + if ((*param)->stat > 2) + printf("\tDecoding data, page %04d item %02d... ", + ptr->page, i); + + switch (ptr->image_data[i].format) { + case JBIG: + ret = cnki_jbig(&stream, + &stream_size, + &wh[0], + &wh[1], + ptr->image_data[i].image, + ptr->image_data[i].size); + + if (ret != 0) { + dim[i * 2] = 0; + dim[i * 2 + 1] = 0; + break; + } + + snprintf(buf, 64, "/Width %d\n/Height %d\n", + wh[0], wh[1]); + strcat(dictionary, buf); + + strcat(dictionary, "/ColorSpace /DeviceGray\n" + "/BitsPerComponent 1\n"); + + snprintf(buf, 64, "/Length %d\n", + stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /CCITTFaxDecode\n"); + + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; + break; + case DCT_0: + case DCT_1: + ret = strinfo_jpeg_dim(&wh[0], + &wh[1], + ptr->image_data[i].image, + ptr->image_data[i].size); + + if (ret != 0) { + dim[i * 2] = 0; + dim[i * 2 + 1] = 0; + break; + } + + stream_size = ptr->image_data[i].size; + stream = malloc(stream_size); + if (stream == NULL) { + free(dictionary); + free(root_kid); + free(dim); + return 1; + } + memcpy(stream, ptr->image_data[i].image, stream_size); + + snprintf(buf, 64, "/Width %d\n/Height %d\n", + wh[0], wh[1]); + strcat(dictionary, buf); + + strcat(dictionary, "/ColorSpace /DeviceRGB\n" + "/BitsPerComponent 8\n"); + + snprintf(buf, 64, "/Length %d\n", + stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /DCTDecode\n"); + + dim[i * 2] = wh[0]; + dim[i * 2 + 1] = wh[1]; + break; + case JBIG2: + case JPX: + default: + ret = -1; + dim[i * 2] = -1; + dim[i * 2 + 1] = -1; + break; + } + + strcat(dictionary, ">>"); + + if (ret == 0) { + if ((*param)->stat > 2) + printf("Succeed\n"); + + pdf_obj_append(&pdf, ids[i], + NULL, dictionary, stream, stream_size); + + free(dictionary); + free(stream); + } else if (ret == 1) { + if ((*param)->stat > 2) + printf("; Failed\n"); + + free(dictionary); + + pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0); + } else { + free(dictionary); + } + } + + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n/XObject <<"); + + for (int i = 0; i < ptr->image_length; i++) { + snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); + strcat(dictionary, buf); + + if (i + 1 < ptr->image_length) + strcat(dictionary, " "); + } + + strcat(dictionary, ">>\n>>"); + + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + + free(dictionary); + + int conv_size; + char *conv_dst; + char conv_src[2]; + char conv_hex[3]; + + if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { + cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); + + dictionary_size = stream_size / 8 + 7; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<feff"); + + for (int i = 0; i < stream_size; i += 16) { + conv_src[0] = stream[i + 7]; + conv_src[1] = stream[i + 6]; + + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + for (int j = 0; j < conv_size - 2; j++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[j]); + strcat(dictionary, conv_hex); + } + free(conv_dst); + } + } + free(stream); + + strcat(dictionary, ">"); + } else { + dictionary_size = ptr->text_size; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<feff"); + + for (int i = 0; i < ptr->text_size; i += 4) { + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = ptr->text[i + 2]; + + conv_size = 6; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + for (int j = 0; j < conv_size - 2; j++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[j]); + strcat(dictionary, conv_hex); + } + free(conv_dst); + } + } + + strcat(dictionary, ">"); + } + + /* FIXME: Use the text somehow? */ + free(dictionary); + + dictionary_size = 64 + 12 * ptr->image_length; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "q\n"); + + strcat(dictionary, "0.120000 0 0 0.120000 0 0 cm\n"); + + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; + + /* Apply transformation matrix */ + if (ptr->image_data[i].format == DCT_1) + strcat(dictionary, "-1 0 0 -1 0 0 cm\n"); + + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", + dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Im%d Do\n", i); + strcat(dictionary, buf); + } + + strcat(dictionary, "Q"); + + if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { + free(dictionary); + free(root_kid); + free(dim); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n"); + + snprintf(buf, 64, "/Length %d\n", stream_size); + strcat(dictionary, buf); + + strcat(dictionary, "/Filter /FlateDecode\n"); + + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + NULL, dictionary, stream, stream_size); + + free(stream); + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, "<<\n/Type /Page\n"); + + snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); + strcat(dictionary, buf); + + /* A4 paper */ + strcat(dictionary, "/MediaBox [ 0 0 595.276 841.89 ]\n"); + + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); + + free(dictionary); + + root_kid[cnt++] = ids[ptr->image_length + 2]; + + free(ids); + ids = NULL; + + free(dim); + + ptr = ptr->next; + } + + if ((*param)->stat > 1) { + printf("\t%8s\t%12s\t%12s\t%12s\n", + "id", + "object", + "dictionary", + "stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%8d\t%12d\t%12d\t%12d\n", + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("Generated %d object(s)\n", + pdf_get_count(&pdf)); + + ids = NULL; + + if ((*param)->file_stat->outline > 0) { + if ((*param)->stat > 1) + printf("Generating outline object(s)\n\t%8s\n", "id"); + + pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1); + int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids); + + if ((*param)->stat > 1) + for (int i = 0; i < (*param)->file_stat->outline + 1; i++) + printf("\t%8d\n", ids[i]); + + if ((*param)->stat > 0) { + if (outline != 0) + printf("No outline information\n"); + else + printf("Generated %d outline object(s)\n", + (*param)->file_stat->outline + 1); + } + } + + if ((*param)->stat > 1) + printf("Generating root object\n"); + + dictionary_size = 64 + 12 * (*param)->file_stat->page; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + int root = pdf_get_free_id(&pdf); + + snprintf(buf, 64, "<<\n/Type /Pages\n/Kids "); + strcat(dictionary, buf); + + if ((*param)->file_stat->page > 1) + strcat(dictionary, "["); + + for (int i = 0; i < (*param)->file_stat->page; i++) { + snprintf(buf, 64, "%d 0 R", root_kid[i]); + strcat(dictionary, buf); + if (i + 1 < (*param)->file_stat->page) + strcat(dictionary, " "); + } + + if ((*param)->file_stat->page > 1) + strcat(dictionary, "]"); + + strcat(dictionary, "\n"); + + snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); + strcat(dictionary, buf); + + strcat(dictionary, ">>"); + + pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL, 0); + + free(dictionary); + + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) { + free(root_kid); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + pdf_object_t *tmp = NULL; + + /* Add /Parent to page object */ + for (int i = 0; i < (*param)->file_stat->page; i++) { + if (pdf_get_obj(&pdf, root_kid[i], &tmp) != 0) { + free(dictionary); + free(root_kid); + return 1; + } + + memset(dictionary, 0, dictionary_size); + + strcat(dictionary, tmp->dictionary); + + snprintf(buf, 64, "/Parent %d 0 R\n>>", root); + strcat(dictionary, buf); + + if (pdf_obj_replace(&pdf, root_kid[i], NULL, dictionary, NULL, 0) != 0) { + free(dictionary); + free(root_kid); + return 1; + } + } + + free(root_kid); + + memset(dictionary, 0, dictionary_size); + + if ((*param)->stat > 0) + printf("Generated root object %d.\n", + root); + + if ((*param)->stat > 1) + printf("Generating catalog object\n"); + + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (ids != NULL) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + ids[0]); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>"); + + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL, 0); + + free(dictionary); + + if ((*param)->stat > 0) + printf("Generated catalog object\n"); + + if ((*param)->stat > 1) + printf("Sorting object(s)\n"); + + pdf_obj_sort(&pdf); + + if ((*param)->stat > 0) + printf("Sorted object(s)\n"); + + if ((*param)->stat > 1) + printf("Writing header\n"); + + long cur = 0; + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) { + fprintf(stderr, "Header not written\n"); + return 1; + } else { + if ((*param)->stat > 0) + printf("Header %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 1) + printf("Writing object(s)\n"); + + pdf_dump_obj(&pdf, &(*param)->fp_o); + + if ((*param)->stat > 1) { + printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n", + "address", + "size", + "id", + "object", + "dictionary", + "stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n", + ptr->address, + ptr->size, + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("%d object(s) %ld byte(s) written\n", + pdf_get_count(&pdf), + ftell((*param)->fp_o)); + + long xref = ftell((*param)->fp_o); + + if ((*param)->stat > 1) + printf("Writing cross-reference table\n"); + + if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) { + if ((*param)->stat > 0) + printf("Cross-reference table not written\n"); + } else { + if ((*param)->stat > 0) + printf("Cross-reference table %ld byte(s) written\n", + ftell((*param)->fp_o) - xref); + } + + if ((*param)->stat > 1) + printf("Writing trailer\n"); + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) { + if ((*param)->stat > 0) + printf("Trailer not written\n"); + } else { + if ((*param)->stat > 0) + printf("Trailer %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 0) + printf("Total %ld byte(s) written\n", + ftell((*param)->fp_o)); + + pdf_obj_destroy(&pdf); + + return 0; +} |