From c2ad6549fb337ce707e04aa441c9b492171a3b9d Mon Sep 17 00:00:00 2001 From: yzrh Date: Sun, 25 Dec 2022 18:03:01 +0000 Subject: Handle headless HN and page with no image. Signed-off-by: yzrh --- src/cnki.c | 29 ++++++- src/cnki.h | 5 +- src/cnki_hn.c | 110 ++++++++++++----------- src/cnki_pdf.c | 265 +++++++++++++++++++++++++++++--------------------------- src/cnki_zlib.c | 9 +- src/melon.c | 3 +- 6 files changed, 239 insertions(+), 182 deletions(-) diff --git a/src/cnki.c b/src/cnki.c index 5f120d0..cc49d73 100644 --- a/src/cnki.c +++ b/src/cnki.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -54,6 +54,11 @@ cnki_destroy(cnki_t **param) object_hn_t *ptr_hn; while ((ptr_hn = (*param)->object_hn) != NULL) { (*param)->object_hn = (*param)->object_hn->next; + free(ptr_hn->text); + if (ptr_hn->image_data != NULL) + for (int i = 0; i < ptr_hn->image_length; i++) + free(ptr_hn->image_data[i].image); + free(ptr_hn->image_data); free(ptr_hn); } @@ -71,12 +76,19 @@ cnki_info(cnki_t **param) printf("Reading file header at 0x%x\n", ADDRESS_HEAD); int addr[2]; + unsigned char str[2]; fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET); fread((*param)->file_stat->type, 4, 1, (*param)->fp_i); - if ((*param)->stat > 0) - printf("File type is '%s'\n", (*param)->file_stat->type); + fread(str, 2, 1, (*param)->fp_i); + + if ((*param)->stat > 0) { + if ((unsigned char) (*param)->file_stat->type[0] > 0x7f) + printf("File type is '%02x'\n", (unsigned char) (*param)->file_stat->type[0]); + else + printf("File type is '%s'\n", (*param)->file_stat->type); + } if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) { return 0; @@ -86,6 +98,9 @@ cnki_info(cnki_t **param) } else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) { addr[0] = ADDRESS_HN_PAGE; addr[1] = ADDRESS_HN_OUTLINE; + } else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) { + addr[0] = ADDRESS_C8_PAGE; + addr[1] = 
ADDRESS_HN_OUTLINE; } else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) { return 0; } else { @@ -102,6 +117,14 @@ cnki_info(cnki_t **param) printf("Advised %d page(s)\n", (*param)->file_stat->page); + if (strncmp((*param)->file_stat->type, "HN", 2) == 0 && str[0] == 0xc8 && str[1] == 0x00) { + fseek((*param)->fp_i, 0xd8, SEEK_SET); + return 0; + } else if ((unsigned char) (*param)->file_stat->type[0] == 0xc8) { + fseek((*param)->fp_i, 0x50, SEEK_SET); + return 0; + } + if ((*param)->stat > 1) printf("Reading outline count at 0x%x\n", addr[1]); diff --git a/src/cnki.h b/src/cnki.h index 237a2c1..193e69b 100644 --- a/src/cnki.h +++ b/src/cnki.h @@ -16,6 +16,8 @@ #define ADDRESS_HN_PAGE 0x0090 #define ADDRESS_HN_OUTLINE 0x0158 +#define ADDRESS_C8_PAGE 0x0008 + #define ADDRESS_KDH_BODY 0x00fe #define KEY_KDH "FZHMEI" @@ -64,7 +66,8 @@ typedef struct _object_hn_t { int32_t text_size; int16_t image_length; int16_t page; - int32_t unknown[2]; /* TODO: what is it? */ + int32_t unknown; /* TODO: what is it? 
*/ + int32_t address_next; char *text; struct _hn_image_t *image_data; struct _object_hn_t *next; diff --git a/src/cnki_hn.c b/src/cnki_hn.c index feabb48..4d32092 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, yzrh + * Copyright (c) 2020-2022, yzrh * * SPDX-License-Identifier: Apache-2.0 */ @@ -27,12 +27,13 @@ cnki_hn(cnki_t **param) if ((*param)->stat > 1) { printf("Loading page(s)\n"); - printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n", + printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4s\t%8s\t%8s\n", "address", "text", "length", "page", "unknown", + "next", "code", "address", "image"); @@ -44,7 +45,8 @@ cnki_hn(cnki_t **param) fread(&ptr->text_size, 4, 1, (*param)->fp_i); fread(&ptr->image_length, 2, 1, (*param)->fp_i); fread(&ptr->page, 2, 1, (*param)->fp_i); - fread(&ptr->unknown, 8, 1, (*param)->fp_i); + fread(&ptr->unknown, 4, 1, (*param)->fp_i); + fread(&ptr->address_next, 4, 1, (*param)->fp_i); ptr->text = NULL; ptr->image_data = NULL; @@ -62,66 +64,76 @@ cnki_hn(cnki_t **param) ptr = (*param)->object_hn; while (ptr != NULL) { - ptr->text = malloc(ptr->text_size); + if (ptr->text_size > 0) { + ptr->text = malloc(ptr->text_size); - if (ptr->text == NULL) - return 1; + if (ptr->text == NULL) + return 1; - fseek((*param)->fp_i, ptr->address, SEEK_SET); - fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); + fseek((*param)->fp_i, ptr->address, SEEK_SET); + fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); + } if ((*param)->stat > 1) - printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}", + printf("\t%08x\t%8d\t%6d\t%4d\t%8d\t%08x", ptr->address, ptr->text_size, ptr->image_length, ptr->page, - ptr->unknown[0], - ptr->unknown[1]); - - ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); - - if (ptr->image_data == NULL) - return 1; + ptr->unknown, + ptr->address_next); - for (int i = 0; i < ptr->image_length; i++) { - fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); - fread(&ptr->image_data[i].address, 
4, 1, (*param)->fp_i); - fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); - fseek((*param)->fp_i, - ptr->image_data[i].address + ptr->image_data[i].size, - SEEK_SET); - } - - for (int i = 0; i < ptr->image_length; i++) { - ptr->image_data[i].image = malloc(ptr->image_data[i].size); + if (ptr->image_length > 0) { + ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); - if (ptr->image_data[i].image == NULL) + if (ptr->image_data == NULL) return 1; - fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); - fread(ptr->image_data[i].image, - ptr->image_data[i].size, 1, - (*param)->fp_i); - - if ((*param)->stat > 1) { - if (i == 0) { - printf("\t%4d\t%08x\t%8d\n", - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); - } else { - printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n", - "", - "", - "", - "", - "", - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); + for (int i = 0; i < ptr->image_length; i++) { + fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); + fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); + fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); + fseek((*param)->fp_i, + ptr->image_data[i].address + ptr->image_data[i].size, + SEEK_SET); + } + + for (int i = 0; i < ptr->image_length; i++) { + ptr->image_data[i].image = malloc(ptr->image_data[i].size); + + if (ptr->image_data[i].image == NULL) + return 1; + + fseek((*param)->fp_i, ptr->image_data[i].address, SEEK_SET); + fread(ptr->image_data[i].image, + ptr->image_data[i].size, 1, + (*param)->fp_i); + + if ((*param)->stat > 1) { + if (i == 0) { + printf("\t%4d\t%08x\t%8d\n", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } else { + printf("\t%8s\t%8s\t%6s\t%4s\t%8s\t%8s\t%4d\t%08x\t%8d\n", + "", + "", + "", + "", + "", + "", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } } } + } else if ((*param)->stat > 1) 
{ + printf("\t%4s\t%8s\t%8s\n", + "", + "", + ""); } ptr = ptr->next; diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c index c56a45f..603ce01 100644 --- a/src/cnki_pdf.c +++ b/src/cnki_pdf.c @@ -481,7 +481,10 @@ cnki_pdf_hn(cnki_t **param) * page object */ int *ids = NULL; - pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); + if (ptr->image_length > 0) + pdf_get_free_ids(&pdf, &ids, ptr->image_length + 3); + else + pdf_get_free_ids(&pdf, &ids, 2); int bitmap_size; char *bitmap; @@ -489,27 +492,31 @@ cnki_pdf_hn(cnki_t **param) int stream_size; char *stream; - int *dim = malloc(2 * ptr->image_length * sizeof(int)); + int *dim; - int ret; - int info[3]; + if (ptr->image_length > 0) { + dim = malloc(2 * ptr->image_length * sizeof(int)); - if (dim == NULL) { - free(root_kid); - free(ids); - return 1; - } + if (dim == NULL) { + free(root_kid); + free(ids); + return 1; + } - dictionary_size = 256; - dictionary = malloc(dictionary_size); + dictionary_size = 256; + dictionary = malloc(dictionary_size); - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; + if (dictionary == NULL) { + free(root_kid); + free(ids); + free(dim); + return 1; + } } + int ret; + int info[3]; + for (int i = 0; i < ptr->image_length; i++) { memset(dictionary, 0, dictionary_size); @@ -684,66 +691,42 @@ cnki_pdf_hn(cnki_t **param) } } - memset(dictionary, 0, dictionary_size); + if (ptr->image_length > 0) { + memset(dictionary, 0, dictionary_size); - strcat(dictionary, "<<\n/XObject <<"); + strcat(dictionary, "<<\n/XObject <<"); - for (int i = 0; i < ptr->image_length; i++) { - snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); - strcat(dictionary, buf); + for (int i = 0; i < ptr->image_length; i++) { + snprintf(buf, 64, "/Im%d %d 0 R", i, ids[i]); + strcat(dictionary, buf); - if (i + 1 < ptr->image_length) - strcat(dictionary, " "); - } + if (i + 1 < ptr->image_length) + strcat(dictionary, " "); + } - strcat(dictionary, ">>\n>>"); + strcat(dictionary, ">>\n>>"); - 
pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); + pdf_obj_append(&pdf, ids[ptr->image_length], NULL, dictionary, NULL, 0); - free(dictionary); + free(dictionary); + } int conv_size; char *conv_dst; char conv_src[2]; char conv_hex[3]; - if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { - cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); - - dictionary_size = 64 + 2 * stream_size; - dictionary = malloc(dictionary_size); + if (ptr->text_size > 0) { + if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0 || + strncmp(ptr->text, "COMPRESSTEXT", 12) == 0) { + cnki_zlib(&stream, &stream_size, ptr->text, ptr->text_size); - if (dictionary == NULL) { - free(root_kid); - free(ids); - free(dim); - return 1; - } + free(ptr->text); - memset(dictionary, 0, dictionary_size); - - strcat(dictionary, "text_size = stream_size; + ptr->text = stream; } - free(stream); - strcat(dictionary, ">"); - } else { dictionary_size = 64 + 2 * ptr->text_size; dictionary = malloc(dictionary_size); @@ -758,9 +741,26 @@ cnki_pdf_hn(cnki_t **param) strcat(dictionary, "text_size; i += 4) { - conv_src[0] = ptr->text[i + 3]; - conv_src[1] = ptr->text[i + 2]; + for (int i = 0; i < ptr->text_size; i += 6) { + if (i + 5 >= ptr->text_size) + break; + + conv_src[0] = ptr->text[i + 5]; + conv_src[1] = ptr->text[i + 4]; + + if ((conv_src[0] << 8 | conv_src[1]) == 0xa389) { + strcat(dictionary, "a389"); + continue; + } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38a) { + strcat(dictionary, "a38a"); + continue; + } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa38d) { + strcat(dictionary, "a38d"); + continue; + } else if ((conv_src[0] << 8 | conv_src[1]) == 0xa3a0) { + strcat(dictionary, "a3a0"); + continue; + } conv_size = 6; @@ -776,12 +776,12 @@ cnki_pdf_hn(cnki_t **param) } strcat(dictionary, ">"); - } - /* FIXME: Use the text somehow? */ - free(dictionary); + /* FIXME: Use the text somehow? 
*/ + free(dictionary); + } - dictionary_size = 64 + 64 * ptr->image_length; + dictionary_size = 64 + 128 * ptr->image_length; dictionary = malloc(dictionary_size); if (dictionary == NULL) { @@ -791,96 +791,109 @@ cnki_pdf_hn(cnki_t **param) return 1; } - memset(dictionary, 0, dictionary_size); - - strcat(dictionary, "q\n"); + if (ptr->image_length > 0) { + memset(dictionary, 0, dictionary_size); - strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); + strcat(dictionary, "q\n"); - double resize_x; - double resize_y; + strcat(dictionary, "0.25 0 0 0.25 0 0 cm\n"); - for (int i = 0; i < ptr->image_length; i++) { - if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) - continue; + double resize_x; + double resize_y; - /* Scale within bound of A4 paper */ - resize_x = 595.276 * 4 / dim[i * 2]; - resize_y = 841.89 * 4 / dim[i * 2 + 1]; + for (int i = 0; i < ptr->image_length; i++) { + if (dim[i * 2] <= 0 || dim[i * 2 + 1] <= 0) + continue; - if (resize_y < resize_x) - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", - resize_y, resize_y); - else - snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", - resize_x, resize_x); - strcat(dictionary, buf); + /* Scale within bound of A4 paper */ + resize_x = 595.276 * 4 / dim[i * 2]; + resize_y = 841.89 * 4 / dim[i * 2 + 1]; - /* Apply transformation matrix */ - if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { - snprintf(buf, 64, "1 0 0 1 0 %d cm\n", - dim[i * 2 + 1]); + if (resize_y < resize_x) + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + resize_y, resize_y); + else + snprintf(buf, 64, "%f 0 0 %f 0 0 cm\n", + resize_x, resize_x); strcat(dictionary, buf); - strcat(dictionary, "1 0 0 -1 0 0 cm\n"); - } + /* Apply transformation matrix */ + if (ptr->image_data[i].format == JBIG || ptr->image_data[i].format == DCT_1) { + snprintf(buf, 64, "1 0 0 1 0 %d cm\n", + dim[i * 2 + 1]); + strcat(dictionary, buf); - snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", - dim[i * 2], dim[i * 2 + 1]); - strcat(dictionary, buf); + strcat(dictionary, "1 0 0 -1 0 0 
cm\n"); + } - snprintf(buf, 64, "/Im%d Do\n", i); - strcat(dictionary, buf); - } + snprintf(buf, 64, "%d 0 0 %d 0 0 cm\n", + dim[i * 2], dim[i * 2 + 1]); + strcat(dictionary, buf); - strcat(dictionary, "Q"); + snprintf(buf, 64, "/Im%d Do\n", i); + strcat(dictionary, buf); + } - if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { - free(root_kid); - free(ids); - free(dim); - free(dictionary); - return 1; - } + strcat(dictionary, "Q"); - memset(dictionary, 0, dictionary_size); + if (strdeflate(&stream, &stream_size, dictionary, strlen(dictionary)) != 0) { + free(root_kid); + free(ids); + free(dim); + free(dictionary); + return 1; + } - strcat(dictionary, "<<\n"); + memset(dictionary, 0, dictionary_size); - snprintf(buf, 64, "/Length %d\n", stream_size); - strcat(dictionary, buf); + strcat(dictionary, "<<\n"); - strcat(dictionary, "/Filter /FlateDecode\n"); + snprintf(buf, 64, "/Length %d\n", stream_size); + strcat(dictionary, buf); - strcat(dictionary, ">>"); + strcat(dictionary, "/Filter /FlateDecode\n"); + + strcat(dictionary, ">>"); - pdf_obj_append(&pdf, ids[ptr->image_length + 1], - NULL, dictionary, stream, stream_size); + pdf_obj_append(&pdf, ids[ptr->image_length + 1], + NULL, dictionary, stream, stream_size); - free(stream); + free(stream); + } memset(dictionary, 0, dictionary_size); strcat(dictionary, "<<\n/Type /Page\n"); - snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); - strcat(dictionary, buf); - - snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); - strcat(dictionary, buf); - /* A4 paper */ strcat(dictionary, "/MediaBox [0 0 595.276 841.89]\n"); - /* Add /Parent when we know root */ - pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); + if (ptr->image_length > 0) { + free(dim); - free(dictionary); + snprintf(buf, 64, "/Resources %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length + 1]); + 
strcat(dictionary, buf); + + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 2], NULL, dictionary, NULL, 0); - root_kid[cnt++] = ids[ptr->image_length + 2]; + root_kid[cnt++] = ids[ptr->image_length + 2]; + } else { + snprintf(buf, 64, "/Contents %d 0 R\n", ids[ptr->image_length]); + strcat(dictionary, buf); + + /* Add /Parent when we know root */ + pdf_obj_append(&pdf, ids[ptr->image_length + 1], NULL, dictionary, NULL, 0); + + root_kid[cnt++] = ids[ptr->image_length + 1]; + } + + free(dictionary); free(ids); - free(dim); ptr = ptr->next; } diff --git a/src/cnki_zlib.c b/src/cnki_zlib.c index edff141..075456b 100644 --- a/src/cnki_zlib.c +++ b/src/cnki_zlib.c @@ -13,12 +13,17 @@ int cnki_zlib(char **dst, int *dst_size, const char * restrict src, int src_size) { + uint8_t padding = 0; int32_t size; - memcpy(&size, src + 20, 4); + + if (strncmp(src + 8, "COMPRESSTEXT", 12) == 0) + padding = 8; + + memcpy(&size, src + 12 + padding, 4); *dst_size = size; - if (strinflate(dst, size, src + 24, src_size - 24) != 0) + if (strinflate(dst, size, src + 16 + padding, src_size - 16 - padding) != 0) return 1; return 0; diff --git a/src/melon.c b/src/melon.c index af6aaf4..f8bb645 100644 --- a/src/melon.c +++ b/src/melon.c @@ -98,7 +98,8 @@ main(int argc, char **argv) strerror(errno)); return EXIT_FAILURE; } - } else if (strncmp(param->file_stat->type, "HN", 2) == 0) { + } else if (strncmp(param->file_stat->type, "HN", 2) == 0 || + (unsigned char) param->file_stat->type[0] == 0xc8) { if (cnki_hn(&param) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); -- cgit v1.2.3