From 12ecdd71592eccf7bdb6214edbc7318246469c1c Mon Sep 17 00:00:00 2001 From: yzrh Date: Fri, 14 Aug 2020 22:04:26 +0000 Subject: Initial commit. --- CHANGE.md | 4 + COPYING | 202 ++++++++++++++++++++++++++++ README.md | 40 ++++++ src/GNUmakefile | 28 ++++ src/Makefile | 28 ++++ src/cnki.c | 168 +++++++++++++++++++++++ src/cnki.h | 86 ++++++++++++ src/cnki_caj.c | 40 ++++++ src/cnki_kdh.c | 49 +++++++ src/cnki_nh.c | 110 +++++++++++++++ src/cnki_outline_tree.c | 73 ++++++++++ src/cnki_pdf.c | 351 ++++++++++++++++++++++++++++++++++++++++++++++++ src/cnki_xml.c | 14 ++ src/extern.h | 21 +++ src/iconv.c | 70 ++++++++++ src/iconv.h | 12 ++ src/melon.c | 127 ++++++++++++++++++ src/pdf.c | 228 +++++++++++++++++++++++++++++++ src/pdf.h | 61 +++++++++ src/pdf_cnki.c | 134 ++++++++++++++++++ src/pdf_cnki.h | 7 + src/pdf_get.c | 296 ++++++++++++++++++++++++++++++++++++++++ src/pdf_parser.c | 216 +++++++++++++++++++++++++++++ src/pdf_writer.c | 188 ++++++++++++++++++++++++++ src/version.h | 10 ++ 25 files changed, 2563 insertions(+) create mode 100644 CHANGE.md create mode 100644 COPYING create mode 100644 README.md create mode 100644 src/GNUmakefile create mode 100644 src/Makefile create mode 100644 src/cnki.c create mode 100644 src/cnki.h create mode 100644 src/cnki_caj.c create mode 100644 src/cnki_kdh.c create mode 100644 src/cnki_nh.c create mode 100644 src/cnki_outline_tree.c create mode 100644 src/cnki_pdf.c create mode 100644 src/cnki_xml.c create mode 100644 src/extern.h create mode 100644 src/iconv.c create mode 100644 src/iconv.h create mode 100644 src/melon.c create mode 100644 src/pdf.c create mode 100644 src/pdf.h create mode 100644 src/pdf_cnki.c create mode 100644 src/pdf_cnki.h create mode 100644 src/pdf_get.c create mode 100644 src/pdf_parser.c create mode 100644 src/pdf_writer.c create mode 100644 src/version.h diff --git a/CHANGE.md b/CHANGE.md new file mode 100644 index 0000000..1622de1 --- /dev/null +++ b/CHANGE.md @@ -0,0 +1,4 @@ +0.1.0 (2020-04-08) 
+================== + +* Initial release diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/COPYING @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md new file mode 100644 index 0000000..55e0fa9 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +Melon +===== + +Melon: Converter that produces PDF from CNKI proprietary formats + +Development +----------- + +Currently, PDF, CAJ, and KDH can be converted. Please report +any failures with a sample that can reproduce the behaviour. + +KDH is essentially an invalid PDF file xor'ed with a predetermined key. +You may want to convert the decrypted KDH to valid PDF, although some +PDF readers can display the invalid PDF. 
+ +Usage +===== + +`make` + +Optionally, `make install` + +`melon -o OUTPUT INPUT` + +Options +------- + +-o, --output +Specify output file + +-b, --buffer +Set buffer size (default 512k) + +-v, --verbose +Print more information (twice for even more) + +Thanks +====== + +This project is inspired by [https://github.com/JeziL/caj2pdf](https://github.com/JeziL/caj2pdf) diff --git a/src/GNUmakefile b/src/GNUmakefile new file mode 100644 index 0000000..118501a --- /dev/null +++ b/src/GNUmakefile @@ -0,0 +1,28 @@ +# +# Copyright (c) 2020, yzrh +# +# SPDX-License-Identifier: Apache-2.0 +# + +src != ls *.c +obj = ${src:.c=.o} + +PREFIX = /usr/local + +CFLAGS = -O3 -march=native -pipe -Wall +LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed + +all: ${obj} + ${CC} ${LDFLAGS} -o melon $^ + +clean: + rm -f melon ${obj} + +install: + install -d ${PREFIX}/bin + install melon ${PREFIX}/bin/ + +deinstall: + rm -f ${PREFIX}/bin/melon + +.PHONY: all clean install deinstall diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..33da2cd --- /dev/null +++ b/src/Makefile @@ -0,0 +1,28 @@ +# +# Copyright (c) 2020, yzrh +# +# SPDX-License-Identifier: Apache-2.0 +# + +src != ls *.c +obj = ${src:.c=.o} + +PREFIX = /usr/local + +CFLAGS = -O3 -march=native -pipe -flto=thin -Wall +LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed + +all: ${obj} + ${CC} ${LDFLAGS} -o melon $> + +clean: + rm -f melon ${obj} + +install: + install -d ${PREFIX}/bin + install melon ${PREFIX}/bin/ + +deinstall: + rm -f ${PREFIX}/bin/melon + +.PHONY: all clean install deinstall diff --git a/src/cnki.c b/src/cnki.c new file mode 100644 index 0000000..001be54 --- /dev/null +++ b/src/cnki.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "cnki.h" + +int +cnki_create(cnki_t **param) +{ + if (*param != NULL) + return 1; + + *param = malloc(sizeof(cnki_t)); + + if (*param == NULL) + return 1; + + (*param)->stat = 0; + 
(*param)->size_buf = 524288; + (*param)->fp_i = NULL; + (*param)->fp_o = NULL; + + (*param)->file_stat = malloc(sizeof(file_stat_t)); + + if ((*param)->file_stat== NULL) + return 1; + + memset((*param)->file_stat, 0, sizeof(file_stat_t)); + + (*param)->object_outline = NULL; + (*param)->object_nh = NULL; + + return 0; +} + +void +cnki_destroy(cnki_t **param) +{ + if (*param != NULL) { + if ((*param)->file_stat != NULL) + free((*param)->file_stat); + if ((*param)->object_outline != NULL) + free((*param)->object_outline); + if ((*param)->object_nh != NULL) + free((*param)->object_nh); + free(*param); + } +} + +int +cnki_info(cnki_t **param) +{ + if (*param == NULL) + return 1; + + if ((*param)->stat > 1) + printf("Reading file header at %x\n", ADDRESS_HEAD); + + int addr[2]; + + fseek((*param)->fp_i, ADDRESS_HEAD, SEEK_SET); + fread((*param)->file_stat->type, 4, 1, (*param)->fp_i); + + if ((*param)->stat > 0) + printf("File type is '%s'\n", (*param)->file_stat->type); + + if (strcmp((*param)->file_stat->type, "%PDF") == 0) { + return 0; + } else if (strcmp((*param)->file_stat->type, "CAJ") == 0) { + addr[0] = ADDRESS_CAJ_PAGE; + addr[1] = ADDRESS_CAJ_OUTLINE; + } else if (strcmp((*param)->file_stat->type, "HN") == 0) { + addr[0] = ADDRESS_HN_PAGE; + addr[1] = ADDRESS_HN_OUTLINE; + } else if (strcmp((*param)->file_stat->type, "KDH ") == 0) { + return 0; + } else { + return 1; + } + + if ((*param)->stat > 1) + printf("Reading page count at %x\n", addr[0]); + + fseek((*param)->fp_i, addr[0], SEEK_SET); + fread(&(*param)->file_stat->page, 4, 1, (*param)->fp_i); + + if ((*param)->stat > 0) + printf("Advised %d page(s)\n", + (*param)->file_stat->page); + + if ((*param)->stat > 1) + printf("Reading outline count at %x\n", addr[1]); + + fseek((*param)->fp_i, addr[1], SEEK_SET); + fread(&(*param)->file_stat->outline, 4, 1, (*param)->fp_i); + + if ((*param)->stat > 0) + printf("Advised %d outline(s)\n", + (*param)->file_stat->outline); + + if ((*param)->file_stat->outline > 0) 
/*
 * Header fields of the input file, filled in by cnki_info().
 */
typedef struct _file_stat_t {
	/*
	 * Four raw bytes read from ADDRESS_HEAD; cnki_info() compares them
	 * with "%PDF", "CAJ", "HN" and "KDH ".
	 * NOTE(review): there is no room for a terminating NUL, so string
	 * functions applied to this field run into `page` -- confirm that
	 * this is intended.
	 */
	char type[4];
	int32_t page;		/* Page count as stored in the file */
	int32_t outline;	/* Number of outline entries as stored in the file */
} file_stat_t;
/*
 * Conversion context handed to every cnki_* routine.
 */
typedef struct _cnki_t {
	int stat;		/* Verbosity: progress is printed when > 0, details when > 1 */
	int size_buf;		/* Buffer size in bytes; cnki_create() sets it to 524288 */
	FILE *fp_i;		/* Stream the input document is read from */
	FILE *fp_o;		/* Stream the result is written to */
	file_stat_t *file_stat;	/* Header fields, allocated by cnki_create() */
	object_outline_t *object_outline;	/* Head of the outline list built by cnki_info(), or NULL */
	object_nh_t *object_nh;	/* Head of the page list built by cnki_nh(), or NULL */
} cnki_t;
+ + cnki_pdf(param); + + if ((*param)->stat > 0) + printf("Conversion ended\n"); + + return 0; +} diff --git a/src/cnki_kdh.c b/src/cnki_kdh.c new file mode 100644 index 0000000..8441319 --- /dev/null +++ b/src/cnki_kdh.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "cnki.h" + +int +cnki_kdh(cnki_t **param) +{ + if (*param == NULL) + return 1; + + if ((*param)->stat > 0) + printf("Begin 'KDH' decryption\n"); + + fseek((*param)->fp_i, 0, SEEK_END); + + long size = ftell((*param)->fp_i); + + fseek((*param)->fp_i, ADDRESS_KDH_BODY, SEEK_SET); + + const char key[] = KEY_KDH; + const int key_len = KEY_KDH_LENGTH; + long key_cur = 0; + + char buf[(*param)->size_buf]; + + for (;;) { + fread(buf, (*param)->size_buf, 1, (*param)->fp_i); + + for (int i = 0; i < (*param)->size_buf; i++) { + buf[i] ^= key[key_cur % key_len]; + key_cur++; + } + + fwrite(buf, (*param)->size_buf, 1, (*param)->fp_o); + + if (ftell((*param)->fp_i) == size) + break; + } + + if ((*param)->stat > 0) + printf("Decryption ended total %ld byte(s) written\n", + ftell((*param)->fp_o)); + + return 0; +} diff --git a/src/cnki_nh.c b/src/cnki_nh.c new file mode 100644 index 0000000..7b9378f --- /dev/null +++ b/src/cnki_nh.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "cnki.h" + +int +cnki_nh(cnki_t **param) +{ + if (*param == NULL) + return 1; + + if ((*param)->stat > 0) + printf("Begin 'HN' conversion\n"); + + if ((*param)->file_stat->page > 0) + (*param)->object_nh = malloc(sizeof(object_nh_t)); + else + return 1; + + if ((*param)->object_nh == NULL) + return 1; + + if ((*param)->stat > 1) { + printf("Loading page(s)\n"); + printf("\t%8s\t%8s\t%13s\t%6s\t%4s\t%8s\t%8s\n", + "address", + "text", + "page", + "zero", + "code", + "address", + "image"); + } + + object_nh_t *ptr = (*param)->object_nh; + for (int i = 0; i < (*param)->file_stat->page; i++) { + 
fread(&ptr->address, 4, 1, (*param)->fp_i); + fread(&ptr->size, 4, 1, (*param)->fp_i); + fread(&ptr->page, 4, 1, (*param)->fp_i); + fread(&ptr->zero, 8, 1, (*param)->fp_i); + + ptr->text = NULL; + ptr->image_format = -1; + ptr->image_address = 0; + ptr->image_size = 0; + ptr->image = NULL; + ptr->next = NULL; + + if (i < (*param)->file_stat->page - 1) { + ptr->next = malloc(sizeof(object_nh_t)); + + if (ptr->next == NULL) + return 1; + } + + ptr = ptr->next; + } + + ptr = (*param)->object_nh; + while (ptr != NULL) { + ptr->text = malloc(ptr->size); + + if (ptr->text == NULL) + return 1; + + fseek((*param)->fp_i, ptr->address, SEEK_SET); + fread(ptr->text, ptr->size, 1, (*param)->fp_i); + fread(&ptr->image_format, 4, 1, (*param)->fp_i); + fread(&ptr->image_address, 4, 1, (*param)->fp_i); + fread(&ptr->image_size, 4, 1, (*param)->fp_i); + + ptr->image = malloc(ptr->image_size); + + if (ptr->image == NULL) + return 1; + + fseek((*param)->fp_i, ptr->image_address, SEEK_SET); + fread(ptr->image, ptr->image_size, 1, (*param)->fp_i); + + if ((*param)->stat > 1) + printf("\t%08x\t%8d\t{%d, %8d}\t{%d, %d}\t%4d\t%08x\t%8d\n", + ptr->address, + ptr->size, + ptr->page[0], + ptr->page[1], + ptr->zero[0], + ptr->zero[1], + ptr->image_format, + ptr->image_address, + ptr->image_size); + + ptr = ptr->next; + } + + if ((*param)->stat > 1) + printf("Loaded %d page(s)\n", (*param)->file_stat->page); + + /* TODO: Study signed int __fastcall CAJDoc::OpenNHCAJFile(int a1, int a2) */ + + if ((*param)->stat > 0) + printf("Conversion ended\n"); + + /* TODO: Finish me please :) */ + return 1; +} diff --git a/src/cnki_outline_tree.c b/src/cnki_outline_tree.c new file mode 100644 index 0000000..7d16ddb --- /dev/null +++ b/src/cnki_outline_tree.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "cnki.h" + +int +cnki_outline_tree(object_outline_tree_t **outline_tree, + object_outline_t **outline, int *ids) +{ + if 
(*outline_tree != NULL || *outline == NULL) + return 1; + + int pos = 0; + + *outline_tree = malloc(sizeof(object_outline_tree_t)); + + if (*outline_tree == NULL) + return 1; + + object_outline_tree_t *tree = *outline_tree; + + tree->id = ids[pos++]; + tree->item = NULL; + tree->up = NULL; + tree->left = NULL; + tree->right = NULL; + + object_outline_t *ptr = *outline; + while (ptr != NULL) { + if (tree->item == NULL || + ptr->depth == tree->item->depth) { + while (tree->left != NULL) + tree = tree->left; + + tree->left = malloc(sizeof(object_outline_tree_t)); + + if (tree->left == NULL) + return 1; + + tree->left->id = ids[pos++]; + tree->left->item = ptr; + tree->left->up = tree; + tree->left->left = NULL; + tree->left->right = NULL; + + tree = tree->left; + } else if (ptr->depth == tree->item->depth + 1) { + tree->right = malloc(sizeof(object_outline_tree_t)); + + if (tree->right == NULL) + return 1; + + tree->right->id = ids[pos++]; + tree->right->item = ptr; + tree->right->up = tree; + tree->right->left = NULL; + tree->right->right = NULL; + + tree = tree->right; + } else { + tree = tree->up; + continue; + } + ptr = ptr->next; + } + + return 0; +} diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c new file mode 100644 index 0000000..e56decb --- /dev/null +++ b/src/cnki_pdf.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "cnki.h" +#include "pdf.h" +#include "pdf_cnki.h" + +int +cnki_pdf(cnki_t **param) +{ + if (*param == NULL) + return 1; + + pdf_object_t *pdf = NULL; + + if (pdf_obj_create(&pdf) != 0) + return 1; + + if ((*param)->stat > 0) + printf("Begin processing PDF\n"); + + if ((*param)->stat > 1) + printf("Loading object(s)\n"); + + if (pdf_load(&pdf, &(*param)->fp_i, (*param)->size_buf) != 0) + return 1; + + if ((*param)->stat > 1) { + printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n", + "address", + "size", + "id", + "object", + "dictionary", + "stream"); + + pdf_object_t 
*ptr = pdf->next; + while (ptr != NULL) { + printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n", + ptr->address, + ptr->size, + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("Loaded %d object(s)\n", + pdf_get_count(&pdf)); + + if ((*param)->stat > 1) + printf("Searching for parent object(s)\n"); + + int *parent = NULL; + pdf_get_parent_id(&pdf, &parent); + + if (parent[0] == 0) + return 1; + + if ((*param)->stat > 0) + printf("Discovered %d parent object(s)\n", parent[0]); + + char buf[64]; + + int parent_missing[parent[0]]; + int *kid; + int dictionary_size; + char *dictionary; + + for (int i = 1; i <= parent[0]; i++) { + if ((*param)->stat > 1) + printf("Searching for object %d\n", parent[i]); + + kid = NULL; + pdf_get_kid_id(&pdf, parent[i], &kid); + + if (kid[0] != 0) { + if ((*param)->stat > 0) + printf("Object is missing\n"); + + if ((*param)->stat > 1) + printf("Generating object\n"); + + dictionary_size = 64 + 12 * kid[0]; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) + return 1; + + memset(dictionary, 0, dictionary_size); + + snprintf(buf, 64, + "<<\n/Type /Pages\n/Kids ["); + strcat(dictionary, buf); + for (int j = 1; j <= kid[0]; j++) { + snprintf(buf, 64, + "%d 0 R", + kid[j]); + strcat(dictionary, buf); + if (j < kid[0]) + strcat(dictionary, " "); + } + snprintf(buf, 64, + "]\n/Count %d\n>>\n", + pdf_get_kid_count(&pdf, parent[i])); + strcat(dictionary, buf); + + pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL); + + parent_missing[i - 1] = 1; + + if ((*param)->stat > 0) + printf("Generated object for %d child(ren)\n", + kid[0]); + + free(dictionary); + } else { + parent_missing[i - 1] = 0; + + if ((*param)->stat > 0) + printf("Object exists\n"); + } + + free(kid); + } + + if ((*param)->stat > 1) + printf("Searching for root object\n"); + + dictionary_size = 128; + dictionary = malloc(dictionary_size); + + if (dictionary == NULL) + return 
1; + + memset(dictionary, 0, dictionary_size); + + int root = 0; + + int root_kid = 0; + for (int i = 0; i < parent[0]; i++) + if (parent_missing[i]) + root_kid++; + + if (root_kid <= 1) { + if (root_kid == 0) { + for (int i = 1; i <= parent[0]; i++) + if (root == 0 || root < parent[i]) + root = parent[i]; + } else { + for (int i = 0; i < parent[0]; i++) + if (parent_missing[i]) + root = i; + } + + if ((*param)->stat > 0) + printf("Root object is %d.\n", + root); + } else { + if ((*param)->stat > 0) + printf("Root object is missing\n"); + + if ((*param)->stat > 1) + printf("Generating root object\n"); + + root = pdf_get_free_id(&pdf); + + snprintf(buf, 64, + "<<\n/Type /Pages\n/Kids "); + strcat(dictionary, buf); + + if (parent[0] > 1) + strcat(dictionary, "["); + + for (int i = 0; i < parent[0]; i++) { + if (parent_missing[i]) { + snprintf(buf, 64, "%d 0 R", parent[i + 1]); + strcat(dictionary, buf); + if (i < root_kid) + strcat(dictionary, " "); + } + } + + if (parent[0] > 1) + strcat(dictionary, "]"); + + strcat(dictionary, "\n"); + + snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page); + strcat(dictionary, buf); + + strcat(dictionary, ">>\n"); + + pdf_obj_prepend(&pdf, root, NULL, dictionary, NULL); + + memset(dictionary, 0, dictionary_size); + + if ((*param)->stat > 0) + printf("Generated root object %d.\n", + root); + } + + int *ids = NULL; + + if ((*param)->file_stat->outline > 0) { + if ((*param)->stat > 1) + printf("Generating outline object(s)\n\t%8s\n", "id"); + + pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1); + int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids); + + if ((*param)->stat > 1) + for (int i = 0; i < (*param)->file_stat->outline + 1; i++) + printf("\t%8d\n", ids[i]); + + if ((*param)->stat > 0) { + if (outline != 0) + printf("No outline information\n"); + else + printf("Generated %d outline object(s)\n", + (*param)->file_stat->outline + 1); + } + } + + if ((*param)->stat > 1) + printf("Generating 
'/Catalog' dictionary\n"); + + snprintf(buf, 64, + "<<\n/Type /Catalog\n/Pages %d 0 R\n", + root); + strcat(dictionary, buf); + + if (ids != NULL) { + snprintf(buf, 64, + "/Outlines %d 0 R\n/PageMode /UseOutlines\n", + ids[0]); + strcat(dictionary, buf); + } + + strcat(dictionary, ">>\n"); + + pdf_obj_append(&pdf, 0, NULL, dictionary, NULL); + + free(dictionary); + + if ((*param)->stat > 0) + printf("Generated '/Catalog' dictionary\n"); + + if ((*param)->stat > 1) + printf("Sorting object(s)\n"); + + pdf_obj_sort(&pdf); + + if ((*param)->stat > 0) + printf("Sorted object(s)\n"); + + if ((*param)->stat > 1) + printf("Writing header\n"); + + long cur = 0; + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) { + fprintf(stderr, "Header not written\n"); + return 1; + } else { + if ((*param)->stat > 0) + printf("Header %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 1) + printf("Writing object(s)\n"); + + pdf_dump_obj(&pdf, &(*param)->fp_o); + + if ((*param)->stat > 1) { + printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n", + "address", + "size", + "id", + "object", + "dictionary", + "stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n", + ptr->address, + ptr->size, + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("%d object(s) %ld byte(s) written\n", + pdf_get_count(&pdf), + ftell((*param)->fp_o)); + + long xref = ftell((*param)->fp_o); + + if ((*param)->stat > 1) + printf("Writing cross-reference table\n"); + + if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) { + if ((*param)->stat > 0) + printf("Cross-reference table not written\n"); + } else { + if ((*param)->stat > 0) + printf("Cross-reference table %ld byte(s) written\n", + ftell((*param)->fp_o) - xref); + } + + if ((*param)->stat > 1) + printf("Writing trailer\n"); + + if 
((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) { + if ((*param)->stat > 0) + printf("Trailer not written\n"); + } else { + if ((*param)->stat > 0) + printf("Trailer %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 0) + printf("Total %ld byte(s) written\n", + ftell((*param)->fp_o)); + + pdf_obj_destroy(&pdf); + + return 0; +} diff --git a/src/cnki_xml.c b/src/cnki_xml.c new file mode 100644 index 0000000..7933738 --- /dev/null +++ b/src/cnki_xml.c @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +int +cnki_xml(char **xml, FILE **fp) +{ + /* TODO: Extract XML and embed into `/Metadata' */ + return 1; +} diff --git a/src/extern.h b/src/extern.h new file mode 100644 index 0000000..b7abc6e --- /dev/null +++ b/src/extern.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "cnki.h" + +/* cnki.c */ +int cnki_create(cnki_t **param); +void cnki_destroy(cnki_t **param); +int cnki_info(cnki_t **param); + +/* cnki_caj.c */ +int cnki_caj(cnki_t **param); + +/* cnki_nh.c */ +int cnki_nh(cnki_t **param); + +/* cnki_kdh.c */ +int cnki_kdh(cnki_t **param); diff --git a/src/iconv.c b/src/iconv.c new file mode 100644 index 0000000..1bf4d94 --- /dev/null +++ b/src/iconv.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include + +/* So, why would anyone use something other than UTF-8? 
/*
 * Convert the NUL-terminated string `src' from `src_code' to `dst_code'.
 *
 * On entry `*size' is the capacity, in bytes, of the scratch buffer the
 * conversion may use.  The terminator of `src' is converted together with
 * the text and is assumed to take two bytes in the target encoding (true
 * for UTF-16/UCS-2); those two bytes are dropped from the result.
 *
 * On success 0 is returned, `*dst' points to a newly allocated buffer that
 * the caller has to free() and `*size' is its length in bytes.  The result
 * carries no terminator.
 *
 * 1 is returned if an argument is unusable, memory runs out, an encoding
 * is unknown or the conversion itself fails (for example because `*size'
 * is too small); `*dst' and `*size' are left alone in these cases.  If
 * only iconv_close() fails, 1 is returned although `*dst' and `*size'
 * have already been filled in.
 */
int
strconv(char **dst,
	const char * restrict dst_code,
	const char * restrict src,
	const char * restrict src_code,
	int *size)
{
	if (dst == NULL || dst_code == NULL || src == NULL ||
	    src_code == NULL || size == NULL || *size <= 0)
		return 1;

	int ret = 1;

	size_t dst_cap = (size_t) *size;
	size_t dst_left = dst_cap;
	size_t src_left = strlen(src) + 1;
	size_t len;

	char *dst_buf = malloc(dst_cap);
	char *src_buf = malloc(src_left);

	/* iconv() advances these, the *_buf pointers stay put for free() */
	char *in = src_buf;
	char *out = dst_buf;
	char *res;

	iconv_t cd = (iconv_t) -1;

	if (dst_buf == NULL || src_buf == NULL)
		goto done;

	/* iconv() wants a writable input pointer, so work on a copy */
	memcpy(src_buf, src, src_left);

	cd = iconv_open(dst_code, src_code);

	if (cd == (iconv_t) -1)
		goto done;

	if (iconv(cd, &in, &src_left, &out, &dst_left) == (size_t) -1)
		goto done;

	/* Not including NULL; refuse results shorter than the terminator */
	if (dst_cap - dst_left < 2)
		goto done;

	len = dst_cap - dst_left - 2;

	/* malloc(0) may legitimately return NULL, never ask for 0 bytes */
	res = malloc(len > 0 ? len : 1);

	if (res == NULL)
		goto done;

	memcpy(res, dst_buf, len);

	*dst = res;
	*size = (int) len;
	ret = 0;

done:
	/* The descriptor is released on every path, not only on success */
	if (cd != (iconv_t) -1 && iconv_close(cd) != 0)
		ret = 1;

	free(dst_buf);
	free(src_buf);

	return ret;
}
PATCH EXTRA "\n"); + printf("Copyright (c) 2020, yzrh \n\n"); + + cnki_t *param = NULL; + + if (cnki_create(¶m) != 0) { + fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); + return EXIT_FAILURE; + } + + int c; + + for (;;) { + static struct option long_options[] = { + {"output", required_argument, 0, 'o'}, + {"buffer", required_argument, 0, 'b'}, + {"verbose", no_argument, 0, 'v'}, + {0, 0, 0, 0} + }; + + int option_index = 0; + + c = getopt_long(argc, argv, "o:b:v", + long_options, &option_index); + + if (c == -1) + break; + + switch (c) { + case 'o': + if ((param->fp_o = fopen(optarg, "w")) == NULL) { + fprintf(stderr, "%s: %s\n", argv[0], + strerror(errno)); + return EXIT_FAILURE; + } + break; + case 'b': + param->size_buf = atoi(optarg); + break; + case 'v': + param->stat += 1; + break; + case '?': + break; + default: + abort(); + } + } + + if (argc - optind == 1) { + if (param->fp_o == NULL) { + if (param->stat == 0) { + param->fp_o = stdout; + } else { + fprintf(stderr, "%s: --verbose ", argv[0]); + fprintf(stderr, "must not be set "); + fprintf(stderr, "when using stdout\n"); + return EXIT_FAILURE; + } + } + + if ((param->fp_i = fopen(argv[optind], "r")) == NULL) { + fprintf(stderr, "%s: %s\n", argv[0], + strerror(errno)); + return EXIT_FAILURE; + } + + cnki_info(¶m); + + if (strcmp(param->file_stat->type, "%PDF") == 0) { + if (cnki_pdf(¶m) != 0) { + fprintf(stderr, "%s: %s\n", argv[0], + strerror(errno)); + return EXIT_FAILURE; + } + } else if (strcmp(param->file_stat->type, "CAJ") == 0) { + if (cnki_caj(¶m) != 0) { + fprintf(stderr, "%s: %s\n", argv[0], + strerror(errno)); + return EXIT_FAILURE; + } + } else if (strcmp(param->file_stat->type, "HN") == 0) { + if (cnki_nh(¶m) != 0) { + fprintf(stderr, "%s: %s\n", argv[0], + strerror(errno)); + return EXIT_FAILURE; + } + } else if (strcmp(param->file_stat->type, "KDH ") == 0) { + if (cnki_kdh(¶m) != 0) { + fprintf(stderr, "%s: %s\n", argv[0], + strerror(errno)); + return EXIT_FAILURE; + } + } else { + 
fprintf(stderr, "%s: %s\n", argv[0], + "Invalid file"); + return EXIT_FAILURE; + } + + fclose(param->fp_i); + fclose(param->fp_o); + } else { + fprintf(stderr, "Usage: %s ", argv[0]); + fprintf(stderr, "[--output --buffer --verbose] file\n"); + return EXIT_FAILURE; + } + + cnki_destroy(¶m); +} diff --git a/src/pdf.c b/src/pdf.c new file mode 100644 index 0000000..92dd717 --- /dev/null +++ b/src/pdf.c @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "pdf.h" + +static int +_min_id(pdf_object_t **pdf) +{ + int min = 0; + + pdf_object_t *ptr = (*pdf)->next; + while (ptr != NULL) { + if (min == 0 || ptr->id < min) + min = ptr->id; + + ptr = ptr->next; + } + + return min; +} + +int +pdf_obj_create(pdf_object_t **pdf) +{ + if (*pdf != NULL) + return 1; + + *pdf = malloc(sizeof(pdf_object_t)); + + if (*pdf == NULL) + return 1; + + (*pdf)->address = 0; + (*pdf)->size = 0; + (*pdf)->id = 0; + (*pdf)->object_size = 0; + (*pdf)->object = NULL; + (*pdf)->dictionary_size = 0; + (*pdf)->dictionary = NULL; + (*pdf)->stream_size = 0; + (*pdf)->stream= NULL; + (*pdf)->next = NULL; + + return 0; +} + +void +pdf_obj_destroy(pdf_object_t **pdf) +{ + pdf_object_t *ptr; + while ((ptr = *pdf) != NULL) { + *pdf = (*pdf)->next; + free(ptr->object); + free(ptr->dictionary); + free(ptr->stream); + free(ptr); + } +} + +int +pdf_obj_add(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream) +{ + if (*pdf != NULL || id <= 0 || + (object != NULL && dictionary != NULL)) + return 1; + + *pdf = malloc(sizeof(pdf_object_t)); + + if (*pdf == NULL) + return 1; + + (*pdf)->address = 0; + (*pdf)->size = 0; + + (*pdf)->id = id; + + if (dictionary != NULL) { + (*pdf)->dictionary_size = strlen(dictionary) + 1; + (*pdf)->dictionary = malloc((*pdf)->dictionary_size); + + if ((*pdf)->dictionary == NULL) + return 1; + + strncpy((*pdf)->dictionary, 
dictionary, (*pdf)->dictionary_size); + + (*pdf)->object_size = 0; + (*pdf)->object = NULL; + } else if (object != NULL) { + (*pdf)->object_size = strlen(object) + 1; + (*pdf)->object = malloc((*pdf)->object_size); + + if ((*pdf)->object == NULL) + return 1; + + strncpy((*pdf)->object, object, (*pdf)->object_size); + + (*pdf)->dictionary_size = 0; + (*pdf)->dictionary = NULL; + } else { + (*pdf)->object_size = 0; + (*pdf)->object = NULL; + (*pdf)->dictionary_size = 0; + (*pdf)->dictionary = NULL; + } + + if (stream != NULL) { + (*pdf)->stream_size = sizeof(stream); + (*pdf)->stream = malloc((*pdf)->stream_size); + + if ((*pdf)->stream == NULL) + return 1; + + memcpy((*pdf)->stream, stream, (*pdf)->stream_size); + } else { + (*pdf)->stream_size = 0; + (*pdf)->stream = NULL; + } + + (*pdf)->next = NULL; + + return 0; +} + +int +pdf_obj_del(pdf_object_t **pdf, int id) +{ + if (*pdf == NULL || id <= 0) + return 1; + + pdf_object_t *ptr = *pdf; + while (ptr->next != NULL) { + if (ptr->next->id == id) { + ptr->next = ptr->next->next; + break; + } + + ptr = ptr->next; + } + + return 0; +} + +int +pdf_obj_prepend(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream) +{ + if (*pdf == NULL) + return 1; + + if (id <= 0) + id = pdf_get_free_id(pdf); + + pdf_object_t *ptr = NULL; + + if (pdf_obj_add(&ptr, id, object, dictionary, stream) != 0) { + free(ptr); + return 1; + } + + ptr->next = (*pdf)->next; + (*pdf)->next = ptr; + + return 0; +} + +int +pdf_obj_append(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream) +{ + if (*pdf == NULL) + return 1; + + if (id <= 0) + id = pdf_get_free_id(pdf); + + pdf_object_t *ptr = *pdf; + while (ptr->next != NULL) + ptr = ptr->next; + + if (pdf_obj_add(&ptr->next, id, object, dictionary, stream) != 0) + return 1; + + return 0; +} + +int +pdf_obj_sort(pdf_object_t **pdf) +{ + if (*pdf == NULL) + 
return 1; + + int id; + pdf_object_t *tmp; + pdf_object_t *ptr; + + ptr = *pdf; + while (ptr->next != NULL) { + id = _min_id(&ptr->next); + + if (id == 0) + return 1; + + if (id < ptr->next->id) { + pdf_get_obj(&ptr->next, id, &tmp); + pdf_obj_del(&ptr->next, id); + + tmp->next = ptr->next; + ptr->next = tmp; + } + + ptr = ptr->next; + } + + return 0; +} diff --git a/src/pdf.h b/src/pdf.h new file mode 100644 index 0000000..61f64d5 --- /dev/null +++ b/src/pdf.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +typedef struct _pdf_object_t { + int address; + int size; + int id; + int object_size; + char *object; + int dictionary_size; + char *dictionary; + int stream_size; + char *stream; + struct _pdf_object_t *next; +} pdf_object_t; + +/* pdf.c */ +/* TODO: Rewrite object dictionary */ +/* TODO: Compact object id */ +/* TODO: `mutool clean -gggsz' */ +int pdf_obj_create(pdf_object_t **pdf); +void pdf_obj_destroy(pdf_object_t **pdf); +int pdf_obj_add(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream); +int pdf_obj_del(pdf_object_t **pdf, int id); +int pdf_obj_prepend(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream); +int pdf_obj_append(pdf_object_t **pdf, int id, + const char * restrict object, + const char * restrict dictionary, + const char * restrict stream); +int pdf_obj_sort(pdf_object_t **pdf); + +/* pdf_parser.c */ +int pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf); + +/* pdf_writer.c */ +int pdf_dump_obj(pdf_object_t **pdf, FILE **fp); +int pdf_dump_header(pdf_object_t **pdf, FILE **fp); +int pdf_dump_xref(pdf_object_t **pdf, FILE **fp); +int pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref); + +/* pdf_get.c */ +int pdf_get_obj(pdf_object_t **pdf, int id, pdf_object_t **obj); +int pdf_get_count(pdf_object_t **pdf); +int 
pdf_get_size(pdf_object_t **pdf); +int pdf_get_free_id(pdf_object_t **pdf); +int pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count); +int pdf_get_catalog_id(pdf_object_t **pdf); +int pdf_get_parent_id(pdf_object_t **pdf, int **id); +int pdf_get_kid_id(pdf_object_t **pdf, int id, int **kid); +int pdf_get_kid_count(pdf_object_t **pdf, int id); diff --git a/src/pdf_cnki.c b/src/pdf_cnki.c new file mode 100644 index 0000000..2dec0b6 --- /dev/null +++ b/src/pdf_cnki.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "cnki.h" +#include "iconv.h" +#include "pdf.h" + +/* + * It will write first, list, and count to *stat + * so that when called recursively, it knows + * what to do + */ +static int +_outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int **stat) +{ + *stat = malloc(3 * sizeof(int)); + + if (*stat == NULL) + return 1; + + int size; + char *str; + + int *ret; + + char buf[64]; + char dictionary[1024]; + + object_outline_tree_t *ptr = *outline_tree; + + (*stat)[0] = ptr->id; + (*stat)[2] = 0; + + while (ptr != NULL) { + memset(dictionary, 0, 1024); + + strcat(dictionary, "<<\n"); + + size = 512; + str = NULL; + + if (strconv(&str, "UTF-16BE", + ptr->item->title, "GB18030", + &size) == 0) { + strcat(dictionary, "/Title \n"); + } + + free(str); + + snprintf(buf, 64, "/Parent %d 0 R\n", id); + strcat(dictionary, buf); + + if (ptr->up != NULL && ptr->up->id != id) { + snprintf(buf, 64, "/Prev %d 0 R\n", ptr->up->id); + strcat(dictionary, buf); + } + + if (ptr->left != NULL) { + snprintf(buf, 64, "/Next %d 0 R\n", ptr->left->id); + strcat(dictionary, buf); + } + + if (ptr->right != NULL) { + _outline(pdf, &ptr->right, ptr->id, &ret); + + snprintf(buf, 64, "/First %d 0 R\n", ret[0]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Last %d 0 R\n", ret[1]); + strcat(dictionary, buf); + + snprintf(buf, 64, "/Count -%d\n", ret[2]); + strcat(dictionary, 
buf); + + free(ret); + } + + /* Page starts from 0 */ + snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>\n", + atoi(ptr->item->page) - 1); + strcat(dictionary, buf); + + pdf_obj_append(pdf, ptr->id, NULL, dictionary, NULL); + + if (ptr->left == NULL) + (*stat)[1] = ptr->id; + + (*stat)[2]++; + + ptr = ptr->left; + } + + return 0; +} + +int +pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids) +{ + if (*pdf == NULL || *outline == NULL || *ids == NULL) + return 1; + + object_outline_tree_t *outline_tree = NULL; + cnki_outline_tree(&outline_tree, outline, *ids); + + char buf[128]; + int *ret; + + _outline(pdf, &outline_tree->left, outline_tree->id, &ret); + + free(outline_tree); + + snprintf(buf, 128, + "<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>\n", + ret[0], ret[1], ret[2]); + + free(ret); + + pdf_obj_append(pdf, (*ids)[0], NULL, buf, NULL); + + return 0; +} diff --git a/src/pdf_cnki.h b/src/pdf_cnki.h new file mode 100644 index 0000000..f0210d0 --- /dev/null +++ b/src/pdf_cnki.h @@ -0,0 +1,7 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +int pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids); diff --git a/src/pdf_get.c b/src/pdf_get.c new file mode 100644 index 0000000..33fb271 --- /dev/null +++ b/src/pdf_get.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +#include "pdf.h" + +static int +_id_in(int id, int *ids) +{ + for (int i = 1; i <= ids[0]; i++) + if (ids[i] == id) + return 1; + + return 0; +} + +int +pdf_get_obj(pdf_object_t **pdf, int id, pdf_object_t **obj) +{ + if (*pdf == NULL || id <= 0) + return 1; + + pdf_object_t *ptr = *pdf; + while (ptr->next != NULL) { + if (ptr->next->id == id) { + *obj = ptr->next; + return 0; + } + ptr = ptr->next; + } + + return 1; +} + +int +pdf_get_count(pdf_object_t **pdf) +{ + if (*pdf == NULL) + return 1; + + int count = 0; + + 
/*
 * Find the `count' smallest positive ids that no object in the list
 * uses and return them, in ascending order, in a newly allocated array.
 *
 * `*ids' has to be NULL on entry.  On success 0 is returned and `*ids'
 * points to exactly `count' ints which the caller has to free().
 * On failure (no list, `*ids' already set, `count' <= 0, out of memory
 * or not enough free ids below 99999999) 1 is returned and `*ids' is
 * NULL.
 */
int
pdf_get_free_ids(pdf_object_t **pdf, int **ids, int count)
{
	if (*pdf == NULL || *ids != NULL || count <= 0)
		return 1;

	*ids = malloc((size_t) count * sizeof(**ids));

	if (*ids == NULL)
		return 1;

	int pos = 0;

	pdf_object_t *ptr;

	for (int i = 1; i < 99999999; i++) {
		ptr = (*pdf)->next;
		while (ptr != NULL && ptr->id != i)
			ptr = ptr->next;

		/* Some object already owns id i */
		if (ptr != NULL)
			continue;

		/*
		 * Stop as soon as the array is full; checking only after
		 * the next store would write one int past the allocation
		 */
		(*ids)[pos++] = i;

		if (pos == count)
			return 0;
	}

	free(*ids);
	*ids = NULL;

	return 1;
}
strchr(head + 8, ' ')) != NULL) { + memset(str, 0, 8); + strncpy(str, head + 8, (tail - head) - 8); + str_val = atoi(str); + + if (!_id_in(str_val, *id)) { + ret = realloc(*id, ++id_size * sizeof(int)); + + if (ret == NULL) + return 1; + else + *id = ret; + + (*id)[0]++; + (*id)[id_size - 1] = str_val; + } + } + ptr = ptr->next; + } + + return 0; +} + +int +pdf_get_kid_id(pdf_object_t **pdf, int id, int **kid) +{ + if (*pdf == NULL || *kid != NULL) + return 1; + + int kid_size = 1; + *kid = malloc(sizeof(int)); + + if (*kid == NULL) + return 1; + + pdf_object_t *ptr = (*pdf)->next; + + char str[32]; + int *ret; + + snprintf(str, 32, "/Parent %d 0 R", id); + + while (ptr != NULL) { + if (ptr->id == id) { + (*kid)[0] = 0; + return 1; + } + + if (ptr->dictionary != NULL && + strstr(ptr->dictionary, str) != NULL) { + ret = realloc(*kid, ++kid_size * sizeof(int)); + + if (ret == NULL) + return 1; + else + *kid = ret; + + (*kid)[kid_size - 1] = ptr->id; + } + + ptr = ptr->next; + } + + (*kid)[0] = kid_size - 1; + + return 0; +} + +int +pdf_get_kid_count(pdf_object_t **pdf, int id) +{ + if (*pdf == NULL || id <= 0) + return 1; + + int count = 0; + + pdf_object_t *ptr = (*pdf)->next; + + char id_str[32]; + char *pos; + + char str[8]; + int str_val; + + snprintf(id_str, 32, "/Parent %d 0 R", id); + + while (ptr != NULL) { + if (ptr->dictionary != NULL && + strstr(ptr->dictionary, id_str) != NULL && + (pos = strstr(ptr->dictionary, "/Count ")) != NULL) { + for (int i = 8; i >= 0; i--) { + if (i + 7 <= ptr->dictionary_size - (pos - ptr->dictionary) && + pos[i + 7] >= '0' && pos[i + 7] <= '9') { + memset(str, 0, 8); + strncpy(str, pos + 7, i + 1); + str_val = atoi(str); + count += str_val; + break; + } + } + } + ptr = ptr->next; + } + + return count; +} diff --git a/src/pdf_parser.c b/src/pdf_parser.c new file mode 100644 index 0000000..1da8dff --- /dev/null +++ b/src/pdf_parser.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + 
*/ + +#ifdef __linux__ + +#define _GNU_SOURCE + +#endif /* __linux__ */ + +#include +#include + +#include "pdf.h" + +static void * +_memmem_whitespace(const void *p0, size_t s0, const void *p1, size_t s1) +{ + const char whitespace[6] = { + '\r', + '\n', + '\f', + '\t', + '\0', + ' ' + }; + + char tmp[s1 + 1]; + memcpy(tmp, p1, s1); + + char *ret; + + for (int i = 0; i < 6; i++) { + tmp[s1] = whitespace[i]; + if((ret = memmem(p0, s0, tmp, s1 + 1)) != NULL) + return ret; + } + + return NULL; +} + +static int +_locate(pdf_object_t **pdf, FILE **fp, int size_buf) +{ + pdf_object_t *ptr = *pdf; + while (ptr->next != NULL) + ptr = ptr->next; + + char buf[size_buf]; + + long cur = ftell(*fp); + long end; + + fseek(*fp, 0, SEEK_END); + end = ftell(*fp); + fseek(*fp, cur, SEEK_SET); + + int head = 0; + int tail = 0; + char *pos; + char *tmp; + + for (;;) { + fread(buf, size_buf, 1, *fp); + + if (head == 0 && (pos = _memmem_whitespace(buf, size_buf, " 0 obj", 6)) != NULL) + head = cur + (pos - buf) + 7; + + if (tail == 0 && (pos = _memmem_whitespace(buf, size_buf, "endobj", 6)) != NULL) { + /* We need to check if it is the object stored in stream */ + while (memcmp(pos + 7, + "\r\nendstream", 11) == 0 && + (tmp = _memmem_whitespace(pos + 6, + size_buf - (pos - buf) - 6, + "endobj", 6)) != NULL) + pos = tmp; + + if (pos - buf < size_buf - 7) + tail = cur + (pos - buf); + } + + if (tail > head) { + if (ptr->next == NULL) { + ptr->next = malloc(sizeof(pdf_object_t)); + + if (ptr->next == NULL) + return 1; + + ptr->next->id = 0; + ptr->next->object_size = 0; + ptr->next->object = NULL; + ptr->next->dictionary_size = 0; + ptr->next->dictionary = NULL; + ptr->next->stream_size = 0; + ptr->next->stream = NULL; + ptr->next->next = NULL; + ptr = ptr->next; + } + + ptr->address = head; + ptr->size = tail - head; + + fseek(*fp, tail + 6, SEEK_SET); + head = tail = 0; + } else { + fseek(*fp, -6, SEEK_CUR); + } + + if ((cur = ftell(*fp)) + 6 >= end) + break; + } + + return 0; +} + +int 
+pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf) +{ + if (*pdf == NULL || *fp == NULL || size_buf < 7) + return 1; + + if (_locate(pdf, fp, size_buf) != 0) + return 1; + + pdf_object_t *ptr = (*pdf)->next; + + char *buf; + char *head; + char *tail; + char *tmp; + + while (ptr != NULL) { + buf = malloc(ptr->size); + + if (buf == NULL) + return 1; + + memset(buf, 0, ptr->size); + + fseek(*fp, ptr->address - 12, SEEK_SET); + fread(buf, 8, 1, *fp); + + for (int i = 0; i < 8; i++) { + if (buf[i] >= '0' && buf[i] <= '9') { + ptr->id = atoi(buf + i); + break; + } + } + + fseek(*fp, ptr->address, SEEK_SET); + fread(buf, ptr->size, 1, *fp); + + if ((head = memmem(buf, ptr->size, "<<", 2)) != NULL && + (tail = _memmem_whitespace(buf, ptr->size, ">>", 2)) != NULL) { + /* A dictionary object may have nested dictionary */ + while ((tmp = _memmem_whitespace(tail + 2, + ptr->size - (tail - buf) - 2, + ">>", 2)) != NULL) + tail = tmp; + + ptr->dictionary_size = tail - head + 2; + ptr->dictionary = malloc(ptr->dictionary_size + 1); + + if (ptr->dictionary == NULL) + return 1; + + memset(ptr->dictionary, 0, ptr->dictionary_size + 1); + memcpy(ptr->dictionary, head, ptr->dictionary_size); + + if ((head = memmem(tail, + ptr->size - (tail - buf), + "stream\r\n", 8)) != NULL && + (tail = _memmem_whitespace(head, + ptr->size - (head - buf), + "endstream", 9)) != NULL) { + /* + * An object may contain a stream that + * contains another object that + * contains another stream + */ + while (_memmem_whitespace(tail, + ptr->size - (tail - buf), + "endobj", 6) != NULL && + (tmp = _memmem_whitespace(tail + 9, + ptr->size - (tail - buf) - 9, + "endstream", 9)) != NULL) + tail = tmp; + + ptr->stream_size = (tail - head) - 8; + ptr->stream = malloc(ptr->stream_size); + + if (ptr->stream == NULL) + return 1; + + memcpy(ptr->stream, head + 8, ptr->stream_size); + } + } else { + ptr->object_size = ptr->size; + ptr->object = malloc(ptr->object_size + 1); + + if (ptr->object == NULL) + return 1; 
+ + memset(ptr->object, 0, ptr->object_size + 1); + memcpy(ptr->object, buf, ptr->object_size); + } + + free(buf); + + ptr = ptr->next; + } + + return 0; +} diff --git a/src/pdf_writer.c b/src/pdf_writer.c new file mode 100644 index 0000000..3cf4f7c --- /dev/null +++ b/src/pdf_writer.c @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include +#include + +#include + +#include "pdf.h" + +int +pdf_dump_obj(pdf_object_t **pdf, FILE **fp) +{ + if (*pdf == NULL || *fp == NULL) + return 1; + + long cur; + + pdf_object_t *ptr = (*pdf)->next; + while (ptr != NULL) { + ptr->address = cur = ftell(*fp); + + fprintf(*fp, "%d 0 obj\n", ptr->id); + + if (ptr->dictionary != NULL) + fputs(ptr->dictionary, *fp); + else if (ptr->object != NULL) + fputs(ptr->object, *fp); + else if (ptr->stream == NULL) + fputs("null\n", *fp); + + if (ptr->stream != NULL) { + fputs("stream\r\n", *fp); + fwrite(ptr->stream, ptr->stream_size, 1, *fp); + fputs("endstream\n", *fp); + } + + fputs("endobj\n", *fp); + + ptr->size = ftell(*fp) - cur; + + ptr = ptr->next; + } + + return 0; +} + +int +pdf_dump_header(pdf_object_t **pdf, FILE **fp) +{ + if (*pdf == NULL || *fp == NULL) + return 1; + + fputs("%PDF-1.7\n", *fp); + + const unsigned char bin[4] = { + 0xf6, + 0xe4, + 0xfc, + 0xdf, + }; + + fputs("%", *fp); + fwrite(bin, 4, 1, *fp); + fputs("\n", *fp); + + return 0; +} + +int +pdf_dump_xref(pdf_object_t **pdf, FILE **fp) +{ + if (*pdf == NULL || *fp == NULL) + return 1; + + fputs("xref\n", *fp); + + pdf_object_t *ptr = *pdf; + + pdf_object_t *start = ptr; + int count = 1; + + while (ptr != NULL) { + if (ptr->next == NULL || + (ptr->next != NULL && ptr->next->id != ptr->id + 1)) { + fprintf(*fp, "%d %d\n", start->id, count); + + for (; count > 0; count--) { + fprintf(*fp, "%010d %05d %s\r\n", + start->address, + start->address > 0 ? 0 : 65535, + start->size > 0 ? 
"n" : "f"); + start = start->next; + } + + if (ptr->next != NULL) + start = ptr->next; + } + + ptr = ptr->next; + count++; + } + + return 0; +} + +int +pdf_dump_trailer(pdf_object_t **pdf, FILE **fp, int xref) +{ + if (*pdf == NULL || *fp == NULL) + return 1; + + fputs("trailer\n", *fp); + + fputs("<<\n", *fp); + + /* + * File identifiers should be generated using + * (a) Current time + * (b) File path + * (c) Size of file + * (d) Values of all entries in the + * file's document information dictionary + * + * It is recommended to be computed according to RFC 1321 + */ + + time_t timestamp = time(NULL); + int size = pdf_get_size(pdf); + + int buf_size; + char buf[64]; + + buf_size = snprintf(buf, 64, "%lx%x", timestamp, size); + + unsigned char str[64]; + memcpy(str, buf, 64); + + unsigned char fid[MD5_DIGEST_LENGTH]; + MD5(str, buf_size, fid); + + pdf_object_t *ptr = *pdf; + while (ptr->next != NULL) + ptr = ptr->next; + + /* + * TODO: Document information dictionary + * `"/Producer (Melon)"' + * `"/CreationDate (D:YYYYMMDDHHmmSS+00'00')"' + * + * Trailer dictionary + * `"/Info %d 0 R"' + */ + fprintf(*fp, + "/Size %d\n/Root %d 0 R\n", + ptr->id + 1, + pdf_get_catalog_id(pdf)); + + fputs("/ID [", *fp); + + for (int i = 0; i < 2; i++) { + fputs("<", *fp); + + for (int j = 0; j < MD5_DIGEST_LENGTH; j++) + fprintf(*fp, "%02x", fid[j]); + + fputs(">", *fp); + + if (i < 1) + fputs(" ", *fp); + } + + fputs("]\n", *fp); + + fputs(">>\n", *fp); + + fputs("startxref\n", *fp); + + fprintf(*fp, "%d\n", xref); + + fputs("%%EOF\n", *fp); + + return 0; +} diff --git a/src/version.h b/src/version.h new file mode 100644 index 0000000..3773617 --- /dev/null +++ b/src/version.h @@ -0,0 +1,10 @@ +/* + * Copyright (c) 2020, yzrh + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#define VERSION "0" +#define RELEASE "1" +#define PATCH "0" +#define EXTRA "" -- cgit v1.2.3