diff options
author | yzrh <yzrh@noema.org> | 2020-12-30 03:09:00 +0000 |
---|---|---|
committer | yzrh <yzrh@noema.org> | 2020-12-30 03:09:00 +0000 |
commit | 98691d4203f4e578b84b2014db0fbe0c1209cc48 (patch) | |
tree | c528e3ea964111b934ae5e61e847831d62944f41 /src | |
parent | 8d6fbb43c9bc840d4217bf4f0b49b1213f1601a1 (diff) | |
download | melon-98691d4203f4e578b84b2014db0fbe0c1209cc48.tar.gz melon-98691d4203f4e578b84b2014db0fbe0c1209cc48.tar.zst |
Add HN text extraction.
Diffstat (limited to 'src')
-rw-r--r-- | src/GNUmakefile | 2 | ||||
-rw-r--r-- | src/Makefile | 2 | ||||
-rw-r--r-- | src/cnki.c | 8 | ||||
-rw-r--r-- | src/cnki.h | 4 | ||||
-rw-r--r-- | src/cnki_hn.c | 259 | ||||
-rw-r--r-- | src/cnki_zlib.c | 20 | ||||
-rw-r--r-- | src/iconv.c | 4 | ||||
-rw-r--r-- | src/iconv.h | 3 | ||||
-rw-r--r-- | src/melon.c | 8 | ||||
-rw-r--r-- | src/pdf.h | 2 | ||||
-rw-r--r-- | src/pdf_cnki.c | 6 | ||||
-rw-r--r-- | src/pdf_writer.c | 9 | ||||
-rw-r--r-- | src/zlib.c | 31 | ||||
-rw-r--r-- | src/zlib.h | 8 |
14 files changed, 325 insertions, 41 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile index e730845..1263005 100644 --- a/src/GNUmakefile +++ b/src/GNUmakefile @@ -10,7 +10,7 @@ obj = ${src:.c=.o} PREFIX = /usr/local CFLAGS = -O3 -march=native -pipe -Wall -LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed +LDFLAGS = -Wl,-O3 -lcrypto -lz -Wl,--as-needed all: ${obj} ${CC} ${LDFLAGS} -o melon $^ diff --git a/src/Makefile b/src/Makefile index 8bd27dd..eb62818 100644 --- a/src/Makefile +++ b/src/Makefile @@ -10,7 +10,7 @@ obj = ${src:.c=.o} PREFIX = /usr/local CFLAGS = -O3 -march=native -pipe -flto=thin -Wall -LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed +LDFLAGS = -Wl,-O3 -lcrypto -lz -Wl,--as-needed all: ${obj} ${CC} ${LDFLAGS} -o melon $> @@ -69,15 +69,15 @@ cnki_info(cnki_t **param) if ((*param)->stat > 0) printf("File type is '%s'\n", (*param)->file_stat->type); - if (strcmp((*param)->file_stat->type, "%PDF") == 0) { + if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) { return 0; - } else if (strcmp((*param)->file_stat->type, "CAJ") == 0) { + } else if (strncmp((*param)->file_stat->type, "CAJ", 3) == 0) { addr[0] = ADDRESS_CAJ_PAGE; addr[1] = ADDRESS_CAJ_OUTLINE; - } else if (strcmp((*param)->file_stat->type, "HN") == 0) { + } else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) { addr[0] = ADDRESS_HN_PAGE; addr[1] = ADDRESS_HN_OUTLINE; - } else if (strcmp((*param)->file_stat->type, "KDH ") == 0) { + } else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) { return 0; } else { return 1; @@ -64,7 +64,7 @@ typedef struct _object_hn_t { int32_t text_size; int16_t image_length; int16_t page; - int32_t zero[2]; + int32_t unknown[2]; /* TODO: what is it? 
*/ char *text; struct _hn_image_t *image_data; struct _object_hn_t *next; @@ -88,6 +88,8 @@ int cnki_outline_tree(object_outline_tree_t **outline_tree, object_outline_t **outline, int *ids); /* cnki_zlib.c */ +int cnki_zlib(char **dst, int *dst_size, + const char * restrict src, int src_size); /* cnki_xml.c */ int cnki_xml(char **xml, FILE **fp); diff --git a/src/cnki_hn.c b/src/cnki_hn.c index 978aa30..d402c0c 100644 --- a/src/cnki_hn.c +++ b/src/cnki_hn.c @@ -5,8 +5,10 @@ */ #include <stdlib.h> +#include <string.h> #include "cnki.h" +#include "iconv.h" #include "pdf.h" #include "pdf_cnki.h" @@ -29,13 +31,12 @@ cnki_hn(cnki_t **param) if ((*param)->stat > 1) { printf("Loading page(s)\n"); - printf("\t%8s\t%8s\t%6s\t%4s\t%6s\t%6s\t%4s\t%8s\t%8s\n", + printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n", "address", "text", "length", "page", - "zero", - "#", + "unknown", "code", "address", "image"); @@ -47,7 +48,7 @@ cnki_hn(cnki_t **param) fread(&ptr->text_size, 4, 1, (*param)->fp_i); fread(&ptr->image_length, 2, 1, (*param)->fp_i); fread(&ptr->page, 2, 1, (*param)->fp_i); - fread(&ptr->zero, 8, 1, (*param)->fp_i); + fread(&ptr->unknown, 8, 1, (*param)->fp_i); ptr->text = NULL; ptr->image_data = NULL; @@ -74,13 +75,13 @@ cnki_hn(cnki_t **param) fread(ptr->text, ptr->text_size, 1, (*param)->fp_i); if ((*param)->stat > 1) - printf("\t%08x\t%8d\t%6d\t%4d\t{%d, %d}", + printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}", ptr->address, ptr->text_size, ptr->image_length, ptr->page, - ptr->zero[0], - ptr->zero[1]); + ptr->unknown[0], + ptr->unknown[1]); ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t)); @@ -91,6 +92,9 @@ cnki_hn(cnki_t **param) fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i); fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i); fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i); + fseek((*param)->fp_i, + ptr->image_data[i].address + ptr->image_data[i].size, + SEEK_SET); } for (int i = 0; i < ptr->image_length; i++) { @@ -104,12 
+108,24 @@ cnki_hn(cnki_t **param) ptr->image_data[i].size, 1, (*param)->fp_i); - if ((*param)->stat > 1) - printf("\t%6d\t%4d\t%08x\t%8d\n", - i, - ptr->image_data[i].format, - ptr->image_data[i].address, - ptr->image_data[i].size); + if ((*param)->stat > 1) { + if (i == 0) { + printf("\t%4d\t%08x\t%8d\n", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } else { + printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n", + "", + "", + "", + "", + "", + ptr->image_data[i].format, + ptr->image_data[i].address, + ptr->image_data[i].size); + } + } } ptr = ptr->next; @@ -119,16 +135,227 @@ cnki_hn(cnki_t **param) printf("Loaded %d page(s)\n", (*param)->file_stat->page); if ((*param)->stat > 1) - printf("Creating PDF object(s)\n"); + printf("Generating PDF object(s)\n"); pdf_object_t *pdf = NULL; if (pdf_obj_create(&pdf) != 0) return 1; + int buf_size; + char *buf; + + int str_size; + char *str; + + int conv_size; + char *conv_dst; + char conv_src[2]; + char conv_hex[3]; + + ptr = (*param)->object_hn; + while (ptr != NULL) { + if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) { + cnki_zlib(&buf, &buf_size, ptr->text, ptr->text_size); + + str_size = buf_size / 8 + 7; + str = malloc(str_size); + + if (str == NULL) + return 1; + + memset(str, 0, str_size); + + strcat(str, "<feff"); + + for (int i = 0; i < buf_size; i += 16) { + conv_src[0] = buf[i + 7]; + conv_src[1] = buf[i + 6]; + + conv_size = 512; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + for (int j = 0; j < conv_size - 2; j++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[j]); + strcat(str, conv_hex); + } + free(conv_dst); + } + } + free(buf); + + strcat(str, ">"); + } else { + str_size = ptr->text_size; + str = malloc(str_size); + + if (str == NULL) + return 1; + + memset(str, 0, str_size); + + strcat(str, "<feff"); + + for (int i = 0; i < ptr->text_size; i += 4) { + conv_src[0] = ptr->text[i + 3]; + conv_src[1] = 
ptr->text[i + 2]; + + conv_size = 512; + + if (strconv(&conv_dst, "UTF-16BE", + conv_src, "GB18030", &conv_size) == 0) { + for (int j = 0; j < conv_size - 2; j++) { + snprintf(conv_hex, 3, + "%02x", (unsigned char) conv_dst[j]); + strcat(str, conv_hex); + } + free(conv_dst); + } + } + + strcat(str, ">"); + } + + pdf_obj_append(&pdf, 0, str, NULL, NULL); + + free(str); + + ptr = ptr->next; + } + + if ((*param)->stat > 1) { + printf("\t%8s\t%12s\t%12s\t%12s\n", + "id", + "object", + "dictionary", + "stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%8d\t%12d\t%12d\t%12d\n", + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("Generated %d object(s)\n", + pdf_get_count(&pdf)); + + int *ids = NULL; + + if ((*param)->file_stat->outline > 0) { + if ((*param)->stat > 1) + printf("Generating outline object(s)\n\t%8s\n", "id"); + + pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1); + int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids); + + if ((*param)->stat > 1) + for (int i = 0; i < (*param)->file_stat->outline + 1; i++) + printf("\t%8d\n", ids[i]); + + if ((*param)->stat > 0) { + if (outline != 0) + printf("No outline information\n"); + else + printf("Generated %d outline object(s)\n", + (*param)->file_stat->outline + 1); + } + } + + if ((*param)->stat > 1) + printf("Writing header\n"); + + long cur = 0; + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) { + fprintf(stderr, "Header not written\n"); + return 1; + } else { + if ((*param)->stat > 0) + printf("Header %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 1) + printf("Writing object(s)\n"); + + pdf_dump_obj(&pdf, &(*param)->fp_o); + + if ((*param)->stat > 1) { + printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n", + "address", + "size", + "id", + "object", + "dictionary", + 
"stream"); + + pdf_object_t *ptr = pdf->next; + while (ptr != NULL) { + printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n", + ptr->address, + ptr->size, + ptr->id, + ptr->object_size, + ptr->dictionary_size, + ptr->stream_size); + ptr = ptr->next; + } + } + + if ((*param)->stat > 0) + printf("%d object(s) %ld byte(s) written\n", + pdf_get_count(&pdf), + ftell((*param)->fp_o)); + + long xref = ftell((*param)->fp_o); + + if ((*param)->stat > 1) + printf("Writing cross-reference table\n"); + + if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) { + if ((*param)->stat > 0) + printf("Cross-reference table not written\n"); + } else { + if ((*param)->stat > 0) + printf("Cross-reference table %ld byte(s) written\n", + ftell((*param)->fp_o) - xref); + } + + if ((*param)->stat > 1) + printf("Writing trailer\n"); + + if ((*param)->stat > 0) + cur = ftell((*param)->fp_o); + + if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) { + if ((*param)->stat > 0) + printf("Trailer not written\n"); + } else { + if ((*param)->stat > 0) + printf("Trailer %ld byte(s) written\n", + ftell((*param)->fp_o) - cur); + } + + if ((*param)->stat > 0) + printf("Total %ld byte(s) written\n", + ftell((*param)->fp_o)); + + pdf_obj_destroy(&pdf); + if ((*param)->stat > 0) - printf("Conversion ended\n"); + printf("Conversion ended (partial)\n"); /* TODO: Finish me please :) */ - return 1; + return 0; } diff --git a/src/cnki_zlib.c b/src/cnki_zlib.c index 4355433..fd4cedf 100644 --- a/src/cnki_zlib.c +++ b/src/cnki_zlib.c @@ -4,4 +4,22 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include <zlib.h> +#include <stdint.h> +#include <string.h> + +#include "zlib.h" + +int +cnki_zlib(char **dst, int *dst_size, + const char * restrict src, int src_size) +{ + int32_t size; + memcpy(&size, src + 20, 4); + + *dst_size = size; + + if (strinflate(dst, size, src + 24, size - 24) != 0) + return 1; + + return 0; +} diff --git a/src/iconv.c b/src/iconv.c index eadfb4b..f5a3dbe 100644 --- a/src/iconv.c +++ b/src/iconv.c @@ -9,7 
+9,6 @@ #include <iconv.h> -/* So, why would anyone use something other than UTF-8? */ int strconv(char **dst, const char * restrict dst_code, @@ -51,8 +50,7 @@ strconv(char **dst, free(src_start); return 1; } else { - /* Not including NULL */ - *size -= dst_size + 2; + *size -= dst_size; *dst = malloc(*size); diff --git a/src/iconv.h b/src/iconv.h index 50019bc..da7fefa 100644 --- a/src/iconv.h +++ b/src/iconv.h @@ -4,8 +4,7 @@ * SPDX-License-Identifier: Apache-2.0 */ -int -strconv(char **dst, +int strconv(char **dst, const char * restrict dst_code, const char * restrict src, const char * restrict src_code, diff --git a/src/melon.c b/src/melon.c index 62b742d..375cf09 100644 --- a/src/melon.c +++ b/src/melon.c @@ -86,25 +86,25 @@ main(int argc, char **argv, char **envp) cnki_info(&param); - if (strcmp(param->file_stat->type, "%PDF") == 0) { + if (strncmp(param->file_stat->type, "%PDF", 4) == 0) { if (cnki_pdf(&param) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); return EXIT_FAILURE; } - } else if (strcmp(param->file_stat->type, "CAJ") == 0) { + } else if (strncmp(param->file_stat->type, "CAJ", 3) == 0) { if (cnki_caj(&param) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); return EXIT_FAILURE; } - } else if (strcmp(param->file_stat->type, "HN") == 0) { + } else if (strncmp(param->file_stat->type, "HN", 2) == 0) { if (cnki_hn(&param) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); return EXIT_FAILURE; } - } else if (strcmp(param->file_stat->type, "KDH ") == 0) { + } else if (strncmp(param->file_stat->type, "KDH ", 4) == 0) { if (cnki_kdh(&param) != 0) { fprintf(stderr, "%s: %s\n", argv[0], strerror(errno)); @@ -21,8 +21,6 @@ typedef struct _pdf_object_t { /* pdf.c */ /* TODO: Rewrite object dictionary */ -/* TODO: Compact object id */ -/* TODO: `mutool clean -gggsz' */ int pdf_obj_create(pdf_object_t **pdf); void pdf_obj_destroy(pdf_object_t **pdf); int pdf_obj_add(pdf_object_t **pdf, int id, diff --git a/src/pdf_cnki.c b/src/pdf_cnki.c
index 16d5d64..d69797b 100644 --- a/src/pdf_cnki.c +++ b/src/pdf_cnki.c @@ -50,7 +50,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int * &size) == 0) { strcat(dictionary, "/Title <feff"); - for (int i = 0; i < size; i++) { + for (int i = 0; i < size - 2; i++) { snprintf(buf, 64, "%02x", (unsigned char) str[i]); strcat(dictionary, buf); } @@ -89,7 +89,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int * } /* Page starts from 0 */ - snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>\n", + snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>", atoi(ptr->item->page) - 1); strcat(dictionary, buf); @@ -123,7 +123,7 @@ pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids) free(outline_tree); snprintf(buf, 128, - "<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>\n", + "<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>", ret[0], ret[1], ret[2]); free(ret); diff --git a/src/pdf_writer.c b/src/pdf_writer.c index 43c4255..8d5fc16 100644 --- a/src/pdf_writer.c +++ b/src/pdf_writer.c @@ -26,12 +26,15 @@ pdf_dump_obj(pdf_object_t **pdf, FILE **fp) fprintf(*fp, "%d 0 obj\n", ptr->id); - if (ptr->dictionary != NULL) + if (ptr->dictionary != NULL) { fputs(ptr->dictionary, *fp); - else if (ptr->object != NULL) + fputs("\n", *fp); + } else if (ptr->object != NULL) { fputs(ptr->object, *fp); - else if (ptr->stream == NULL) + fputs("\n", *fp); + } else if (ptr->stream == NULL) { fputs("null\n", *fp); + } if (ptr->stream != NULL) { fputs("stream\r\n", *fp); diff --git a/src/zlib.c b/src/zlib.c new file mode 100644 index 0000000..49004b7 --- /dev/null +++ b/src/zlib.c @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2020, yzrh <yzrh@noema.org> + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include <stdlib.h> +#include <string.h> + +#include <zlib.h> + +int +strinflate(char **dst, int dst_size, + const char * restrict src, int src_size) +{ + *dst = malloc(dst_size); + + if 
(*dst == NULL) + return 1; + + unsigned long size = dst_size; + + uncompress((Bytef *) *dst, &size, (const Bytef *) src, src_size); + + if (size != dst_size) { + free(*dst); + return 1; + } + + return 0; +} diff --git a/src/zlib.h b/src/zlib.h new file mode 100644 index 0000000..1563c6c --- /dev/null +++ b/src/zlib.h @@ -0,0 +1,8 @@ +/* + * Copyright (c) 2020, yzrh <yzrh@noema.org> + * + * SPDX-License-Identifier: Apache-2.0 + */ + +int strinflate(char **dst, int dst_size, + const char * restrict src, int src_size); |