aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author yzrh <yzrh@noema.org> 2020-12-30 03:09:00 +0000
committer yzrh <yzrh@noema.org> 2020-12-30 03:09:00 +0000
commit 98691d4203f4e578b84b2014db0fbe0c1209cc48 (patch)
tree c528e3ea964111b934ae5e61e847831d62944f41 /src
parent 8d6fbb43c9bc840d4217bf4f0b49b1213f1601a1 (diff)
download melon-98691d4203f4e578b84b2014db0fbe0c1209cc48.tar.gz
melon-98691d4203f4e578b84b2014db0fbe0c1209cc48.tar.zst
Add HN text extraction.
Diffstat (limited to 'src')
-rw-r--r-- src/GNUmakefile 2
-rw-r--r-- src/Makefile 2
-rw-r--r-- src/cnki.c 8
-rw-r--r-- src/cnki.h 4
-rw-r--r-- src/cnki_hn.c 259
-rw-r--r-- src/cnki_zlib.c 20
-rw-r--r-- src/iconv.c 4
-rw-r--r-- src/iconv.h 3
-rw-r--r-- src/melon.c 8
-rw-r--r-- src/pdf.h 2
-rw-r--r-- src/pdf_cnki.c 6
-rw-r--r-- src/pdf_writer.c 9
-rw-r--r-- src/zlib.c 31
-rw-r--r-- src/zlib.h 8
14 files changed, 325 insertions, 41 deletions
diff --git a/src/GNUmakefile b/src/GNUmakefile
index e730845..1263005 100644
--- a/src/GNUmakefile
+++ b/src/GNUmakefile
@@ -10,7 +10,7 @@ obj = ${src:.c=.o}
PREFIX = /usr/local
CFLAGS = -O3 -march=native -pipe -Wall
-LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed
+LDFLAGS = -Wl,-O3 -lcrypto -lz -Wl,--as-needed
all: ${obj}
${CC} ${LDFLAGS} -o melon $^
diff --git a/src/Makefile b/src/Makefile
index 8bd27dd..eb62818 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -10,7 +10,7 @@ obj = ${src:.c=.o}
PREFIX = /usr/local
CFLAGS = -O3 -march=native -pipe -flto=thin -Wall
-LDFLAGS = -Wl,-O3 -lcrypto -Wl,--as-needed
+LDFLAGS = -Wl,-O3 -lcrypto -lz -Wl,--as-needed
all: ${obj}
${CC} ${LDFLAGS} -o melon $>
diff --git a/src/cnki.c b/src/cnki.c
index 4218adb..3234403 100644
--- a/src/cnki.c
+++ b/src/cnki.c
@@ -69,15 +69,15 @@ cnki_info(cnki_t **param)
if ((*param)->stat > 0)
printf("File type is '%s'\n", (*param)->file_stat->type);
- if (strcmp((*param)->file_stat->type, "%PDF") == 0) {
+ if (strncmp((*param)->file_stat->type, "%PDF", 4) == 0) {
return 0;
- } else if (strcmp((*param)->file_stat->type, "CAJ") == 0) {
+ } else if (strncmp((*param)->file_stat->type, "CAJ", 3) == 0) {
addr[0] = ADDRESS_CAJ_PAGE;
addr[1] = ADDRESS_CAJ_OUTLINE;
- } else if (strcmp((*param)->file_stat->type, "HN") == 0) {
+ } else if (strncmp((*param)->file_stat->type, "HN", 2) == 0) {
addr[0] = ADDRESS_HN_PAGE;
addr[1] = ADDRESS_HN_OUTLINE;
- } else if (strcmp((*param)->file_stat->type, "KDH ") == 0) {
+ } else if (strncmp((*param)->file_stat->type, "KDH ", 4) == 0) {
return 0;
} else {
return 1;
diff --git a/src/cnki.h b/src/cnki.h
index f9adeba..cf5a818 100644
--- a/src/cnki.h
+++ b/src/cnki.h
@@ -64,7 +64,7 @@ typedef struct _object_hn_t {
int32_t text_size;
int16_t image_length;
int16_t page;
- int32_t zero[2];
+ int32_t unknown[2]; /* TODO: what is it? */
char *text;
struct _hn_image_t *image_data;
struct _object_hn_t *next;
@@ -88,6 +88,8 @@ int cnki_outline_tree(object_outline_tree_t **outline_tree,
object_outline_t **outline, int *ids);
/* cnki_zlib.c */
+int cnki_zlib(char **dst, int *dst_size,
+ const char * restrict src, int src_size);
/* cnki_xml.c */
int cnki_xml(char **xml, FILE **fp);
diff --git a/src/cnki_hn.c b/src/cnki_hn.c
index 978aa30..d402c0c 100644
--- a/src/cnki_hn.c
+++ b/src/cnki_hn.c
@@ -5,8 +5,10 @@
*/
#include <stdlib.h>
+#include <string.h>
#include "cnki.h"
+#include "iconv.h"
#include "pdf.h"
#include "pdf_cnki.h"
@@ -29,13 +31,12 @@ cnki_hn(cnki_t **param)
if ((*param)->stat > 1) {
printf("Loading page(s)\n");
- printf("\t%8s\t%8s\t%6s\t%4s\t%6s\t%6s\t%4s\t%8s\t%8s\n",
+ printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4s\t%8s\t%8s\n",
"address",
"text",
"length",
"page",
- "zero",
- "#",
+ "unknown",
"code",
"address",
"image");
@@ -47,7 +48,7 @@ cnki_hn(cnki_t **param)
fread(&ptr->text_size, 4, 1, (*param)->fp_i);
fread(&ptr->image_length, 2, 1, (*param)->fp_i);
fread(&ptr->page, 2, 1, (*param)->fp_i);
- fread(&ptr->zero, 8, 1, (*param)->fp_i);
+ fread(&ptr->unknown, 8, 1, (*param)->fp_i);
ptr->text = NULL;
ptr->image_data = NULL;
@@ -74,13 +75,13 @@ cnki_hn(cnki_t **param)
fread(ptr->text, ptr->text_size, 1, (*param)->fp_i);
if ((*param)->stat > 1)
- printf("\t%08x\t%8d\t%6d\t%4d\t{%d, %d}",
+ printf("\t%08x\t%8d\t%6d\t%4d\t{%4d, %8d}",
ptr->address,
ptr->text_size,
ptr->image_length,
ptr->page,
- ptr->zero[0],
- ptr->zero[1]);
+ ptr->unknown[0],
+ ptr->unknown[1]);
ptr->image_data = malloc(ptr->image_length * sizeof(hn_image_t));
@@ -91,6 +92,9 @@ cnki_hn(cnki_t **param)
fread(&ptr->image_data[i].format, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].address, 4, 1, (*param)->fp_i);
fread(&ptr->image_data[i].size, 4, 1, (*param)->fp_i);
+ fseek((*param)->fp_i,
+ ptr->image_data[i].address + ptr->image_data[i].size,
+ SEEK_SET);
}
for (int i = 0; i < ptr->image_length; i++) {
@@ -104,12 +108,24 @@ cnki_hn(cnki_t **param)
ptr->image_data[i].size, 1,
(*param)->fp_i);
- if ((*param)->stat > 1)
- printf("\t%6d\t%4d\t%08x\t%8d\n",
- i,
- ptr->image_data[i].format,
- ptr->image_data[i].address,
- ptr->image_data[i].size);
+ if ((*param)->stat > 1) {
+ if (i == 0) {
+ printf("\t%4d\t%08x\t%8d\n",
+ ptr->image_data[i].format,
+ ptr->image_data[i].address,
+ ptr->image_data[i].size);
+ } else {
+ printf("\t%8s\t%8s\t%6s\t%4s\t%16s\t%4d\t%08x\t%8d\n",
+ "",
+ "",
+ "",
+ "",
+ "",
+ ptr->image_data[i].format,
+ ptr->image_data[i].address,
+ ptr->image_data[i].size);
+ }
+ }
}
ptr = ptr->next;
@@ -119,16 +135,227 @@ cnki_hn(cnki_t **param)
printf("Loaded %d page(s)\n", (*param)->file_stat->page);
if ((*param)->stat > 1)
- printf("Creating PDF object(s)\n");
+ printf("Generating PDF object(s)\n");
pdf_object_t *pdf = NULL;
if (pdf_obj_create(&pdf) != 0)
return 1;
+ int buf_size;
+ char *buf;
+
+ int str_size;
+ char *str;
+
+ int conv_size;
+ char *conv_dst;
+ char conv_src[2];
+ char conv_hex[3];
+
+ ptr = (*param)->object_hn;
+ while (ptr != NULL) {
+ if (strncmp(ptr->text + 8, "COMPRESSTEXT", 12) == 0) {
+ cnki_zlib(&buf, &buf_size, ptr->text, ptr->text_size);
+
+ str_size = buf_size / 8 + 7;
+ str = malloc(str_size);
+
+ if (str == NULL)
+ return 1;
+
+ memset(str, 0, str_size);
+
+ strcat(str, "<feff");
+
+ for (int i = 0; i < buf_size; i += 16) {
+ conv_src[0] = buf[i + 7];
+ conv_src[1] = buf[i + 6];
+
+ conv_size = 512;
+
+ if (strconv(&conv_dst, "UTF-16BE",
+ conv_src, "GB18030", &conv_size) == 0) {
+ for (int j = 0; j < conv_size - 2; j++) {
+ snprintf(conv_hex, 3,
+ "%02x", (unsigned char) conv_dst[j]);
+ strcat(str, conv_hex);
+ }
+ free(conv_dst);
+ }
+ }
+ free(buf);
+
+ strcat(str, ">");
+ } else {
+ str_size = ptr->text_size;
+ str = malloc(str_size);
+
+ if (str == NULL)
+ return 1;
+
+ memset(str, 0, str_size);
+
+ strcat(str, "<feff");
+
+ for (int i = 0; i < ptr->text_size; i += 4) {
+ conv_src[0] = ptr->text[i + 3];
+ conv_src[1] = ptr->text[i + 2];
+
+ conv_size = 512;
+
+ if (strconv(&conv_dst, "UTF-16BE",
+ conv_src, "GB18030", &conv_size) == 0) {
+ for (int j = 0; j < conv_size - 2; j++) {
+ snprintf(conv_hex, 3,
+ "%02x", (unsigned char) conv_dst[j]);
+ strcat(str, conv_hex);
+ }
+ free(conv_dst);
+ }
+ }
+
+ strcat(str, ">");
+ }
+
+ pdf_obj_append(&pdf, 0, str, NULL, NULL);
+
+ free(str);
+
+ ptr = ptr->next;
+ }
+
+ if ((*param)->stat > 1) {
+ printf("\t%8s\t%12s\t%12s\t%12s\n",
+ "id",
+ "object",
+ "dictionary",
+ "stream");
+
+ pdf_object_t *ptr = pdf->next;
+ while (ptr != NULL) {
+ printf("\t%8d\t%12d\t%12d\t%12d\n",
+ ptr->id,
+ ptr->object_size,
+ ptr->dictionary_size,
+ ptr->stream_size);
+ ptr = ptr->next;
+ }
+ }
+
+ if ((*param)->stat > 0)
+ printf("Generated %d object(s)\n",
+ pdf_get_count(&pdf));
+
+ int *ids = NULL;
+
+ if ((*param)->file_stat->outline > 0) {
+ if ((*param)->stat > 1)
+ printf("Generating outline object(s)\n\t%8s\n", "id");
+
+ pdf_get_free_ids(&pdf, &ids, (*param)->file_stat->outline + 1);
+ int outline = pdf_cnki_outline(&pdf, &(*param)->object_outline, &ids);
+
+ if ((*param)->stat > 1)
+ for (int i = 0; i < (*param)->file_stat->outline + 1; i++)
+ printf("\t%8d\n", ids[i]);
+
+ if ((*param)->stat > 0) {
+ if (outline != 0)
+ printf("No outline information\n");
+ else
+ printf("Generated %d outline object(s)\n",
+ (*param)->file_stat->outline + 1);
+ }
+ }
+
+ if ((*param)->stat > 1)
+ printf("Writing header\n");
+
+ long cur = 0;
+
+ if ((*param)->stat > 0)
+ cur = ftell((*param)->fp_o);
+
+ if (pdf_dump_header(&pdf, &(*param)->fp_o) != 0) {
+ fprintf(stderr, "Header not written\n");
+ return 1;
+ } else {
+ if ((*param)->stat > 0)
+ printf("Header %ld byte(s) written\n",
+ ftell((*param)->fp_o) - cur);
+ }
+
+ if ((*param)->stat > 1)
+ printf("Writing object(s)\n");
+
+ pdf_dump_obj(&pdf, &(*param)->fp_o);
+
+ if ((*param)->stat > 1) {
+ printf("\t%8s\t%8s\t%8s\t%12s\t%12s\t%12s\n",
+ "address",
+ "size",
+ "id",
+ "object",
+ "dictionary",
+ "stream");
+
+ pdf_object_t *ptr = pdf->next;
+ while (ptr != NULL) {
+ printf("\t%08x\t%8d\t%8d\t%12d\t%12d\t%12d\n",
+ ptr->address,
+ ptr->size,
+ ptr->id,
+ ptr->object_size,
+ ptr->dictionary_size,
+ ptr->stream_size);
+ ptr = ptr->next;
+ }
+ }
+
+ if ((*param)->stat > 0)
+ printf("%d object(s) %ld byte(s) written\n",
+ pdf_get_count(&pdf),
+ ftell((*param)->fp_o));
+
+ long xref = ftell((*param)->fp_o);
+
+ if ((*param)->stat > 1)
+ printf("Writing cross-reference table\n");
+
+ if (pdf_dump_xref(&pdf, &(*param)->fp_o) != 0) {
+ if ((*param)->stat > 0)
+ printf("Cross-reference table not written\n");
+ } else {
+ if ((*param)->stat > 0)
+ printf("Cross-reference table %ld byte(s) written\n",
+ ftell((*param)->fp_o) - xref);
+ }
+
+ if ((*param)->stat > 1)
+ printf("Writing trailer\n");
+
+ if ((*param)->stat > 0)
+ cur = ftell((*param)->fp_o);
+
+ if (pdf_dump_trailer(&pdf, &(*param)->fp_o, xref) != 0) {
+ if ((*param)->stat > 0)
+ printf("Trailer not written\n");
+ } else {
+ if ((*param)->stat > 0)
+ printf("Trailer %ld byte(s) written\n",
+ ftell((*param)->fp_o) - cur);
+ }
+
+ if ((*param)->stat > 0)
+ printf("Total %ld byte(s) written\n",
+ ftell((*param)->fp_o));
+
+ pdf_obj_destroy(&pdf);
+
if ((*param)->stat > 0)
- printf("Conversion ended\n");
+ printf("Conversion ended (partial)\n");
/* TODO: Finish me please :) */
- return 1;
+ return 0;
}
diff --git a/src/cnki_zlib.c b/src/cnki_zlib.c
index 4355433..fd4cedf 100644
--- a/src/cnki_zlib.c
+++ b/src/cnki_zlib.c
@@ -4,4 +4,22 @@
* SPDX-License-Identifier: Apache-2.0
*/
-#include <zlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "zlib.h"
+
+/*
+ * Inflate a compressed text record.
+ *
+ * As read here, the record stores the inflated size as a 32-bit integer
+ * (host byte order) at offset 20 and the zlib stream from offset 24 on.
+ *
+ * On success returns 0, *dst is the buffer allocated by strinflate()
+ * (the caller frees it) and *dst_size is its length.
+ * On failure returns 1, *dst is NULL and *dst_size is 0, so a caller
+ * that loops over *dst_size or frees *dst stays safe.
+ */
+int
+cnki_zlib(char **dst, int *dst_size,
+	const char * restrict src, int src_size)
+{
+	*dst = NULL;
+	*dst_size = 0;
+
+	/* The size field and at least one byte of stream must be present */
+	if (src == NULL || src_size <= 24)
+		return 1;
+
+	int32_t size;
+	memcpy(&size, src + 20, 4);
+
+	/*
+	 * Only src_size - 24 bytes follow the header, so bound the input
+	 * by that rather than by the inflated size.
+	 */
+	if (strinflate(dst, size, src + 24, src_size - 24) != 0) {
+		/* strinflate() has already released its buffer on failure */
+		*dst = NULL;
+		return 1;
+	}
+
+	*dst_size = size;
+
+	return 0;
+}
diff --git a/src/iconv.c b/src/iconv.c
index eadfb4b..f5a3dbe 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -9,7 +9,6 @@
#include <iconv.h>
-/* So, why would anyone use something other than UTF-8? */
int
strconv(char **dst,
const char * restrict dst_code,
@@ -51,8 +50,7 @@ strconv(char **dst,
free(src_start);
return 1;
} else {
- /* Not including NULL */
- *size -= dst_size + 2;
+ *size -= dst_size;
*dst = malloc(*size);
diff --git a/src/iconv.h b/src/iconv.h
index 50019bc..da7fefa 100644
--- a/src/iconv.h
+++ b/src/iconv.h
@@ -4,8 +4,7 @@
* SPDX-License-Identifier: Apache-2.0
*/
-int
-strconv(char **dst,
+int strconv(char **dst,
const char * restrict dst_code,
const char * restrict src,
const char * restrict src_code,
diff --git a/src/melon.c b/src/melon.c
index 62b742d..375cf09 100644
--- a/src/melon.c
+++ b/src/melon.c
@@ -86,25 +86,25 @@ main(int argc, char **argv, char **envp)
cnki_info(&param);
- if (strcmp(param->file_stat->type, "%PDF") == 0) {
+ if (strncmp(param->file_stat->type, "%PDF", 4) == 0) {
if (cnki_pdf(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
return EXIT_FAILURE;
}
- } else if (strcmp(param->file_stat->type, "CAJ") == 0) {
+ } else if (strncmp(param->file_stat->type, "CAJ", 3) == 0) {
if (cnki_caj(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
return EXIT_FAILURE;
}
- } else if (strcmp(param->file_stat->type, "HN") == 0) {
+ } else if (strncmp(param->file_stat->type, "HN", 2) == 0) {
if (cnki_hn(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
return EXIT_FAILURE;
}
- } else if (strcmp(param->file_stat->type, "KDH ") == 0) {
+ } else if (strncmp(param->file_stat->type, "KDH ", 4) == 0) {
if (cnki_kdh(&param) != 0) {
fprintf(stderr, "%s: %s\n", argv[0],
strerror(errno));
diff --git a/src/pdf.h b/src/pdf.h
index 4cfb81a..394da5a 100644
--- a/src/pdf.h
+++ b/src/pdf.h
@@ -21,8 +21,6 @@ typedef struct _pdf_object_t {
/* pdf.c */
/* TODO: Rewrite object dictionary */
-/* TODO: Compact object id */
-/* TODO: `mutool clean -gggsz' */
int pdf_obj_create(pdf_object_t **pdf);
void pdf_obj_destroy(pdf_object_t **pdf);
int pdf_obj_add(pdf_object_t **pdf, int id,
diff --git a/src/pdf_cnki.c b/src/pdf_cnki.c
index 16d5d64..d69797b 100644
--- a/src/pdf_cnki.c
+++ b/src/pdf_cnki.c
@@ -50,7 +50,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
&size) == 0) {
strcat(dictionary, "/Title <feff");
- for (int i = 0; i < size; i++) {
+ for (int i = 0; i < size - 2; i++) {
snprintf(buf, 64, "%02x", (unsigned char) str[i]);
strcat(dictionary, buf);
}
@@ -89,7 +89,7 @@ _outline(pdf_object_t **pdf, object_outline_tree_t **outline_tree, int id, int *
}
/* Page starts from 0 */
- snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>\n",
+ snprintf(buf, 64, "/Dest [%d /XYZ null null null]\n>>",
atoi(ptr->item->page) - 1);
strcat(dictionary, buf);
@@ -123,7 +123,7 @@ pdf_cnki_outline(pdf_object_t **pdf, object_outline_t **outline, int **ids)
free(outline_tree);
snprintf(buf, 128,
- "<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>\n",
+ "<<\n/Type Outlines\n/First %d 0 R\n/Last %d 0 R\n/Count %d\n>>",
ret[0], ret[1], ret[2]);
free(ret);
diff --git a/src/pdf_writer.c b/src/pdf_writer.c
index 43c4255..8d5fc16 100644
--- a/src/pdf_writer.c
+++ b/src/pdf_writer.c
@@ -26,12 +26,15 @@ pdf_dump_obj(pdf_object_t **pdf, FILE **fp)
fprintf(*fp, "%d 0 obj\n", ptr->id);
- if (ptr->dictionary != NULL)
+ if (ptr->dictionary != NULL) {
fputs(ptr->dictionary, *fp);
- else if (ptr->object != NULL)
+ fputs("\n", *fp);
+ } else if (ptr->object != NULL) {
fputs(ptr->object, *fp);
- else if (ptr->stream == NULL)
+ fputs("\n", *fp);
+ } else if (ptr->stream == NULL) {
fputs("null\n", *fp);
+ }
if (ptr->stream != NULL) {
fputs("stream\r\n", *fp);
diff --git a/src/zlib.c b/src/zlib.c
new file mode 100644
index 0000000..49004b7
--- /dev/null
+++ b/src/zlib.c
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2020, yzrh <yzrh@noema.org>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <zlib.h>
+
+/*
+ * Inflate a zlib stream of src_size bytes at src into a newly
+ * allocated buffer of exactly dst_size bytes.
+ *
+ * Returns 0 on success; *dst then points to the buffer and the caller
+ * must free() it.
+ * Returns 1 if an argument is unusable, allocation fails, uncompress()
+ * reports an error, or the inflated length differs from dst_size;
+ * *dst is NULL in that case.
+ */
+int
+strinflate(char **dst, int dst_size,
+	const char * restrict src, int src_size)
+{
+	*dst = NULL;
+
+	if (src == NULL || dst_size <= 0 || src_size <= 0)
+		return 1;
+
+	char *out = malloc(dst_size);
+
+	if (out == NULL)
+		return 1;
+
+	uLongf size = dst_size;
+
+	/* Do not trust the output unless uncompress() reports Z_OK */
+	if (uncompress((Bytef *) out, &size,
+		(const Bytef *) src, src_size) != Z_OK ||
+		size != (uLongf) dst_size) {
+		free(out);
+		return 1;
+	}
+
+	*dst = out;
+
+	return 0;
+}
diff --git a/src/zlib.h b/src/zlib.h
new file mode 100644
index 0000000..1563c6c
--- /dev/null
+++ b/src/zlib.h
@@ -0,0 +1,8 @@
+/*
+ * Copyright (c) 2020, yzrh <yzrh@noema.org>
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * Inflate src_size bytes of zlib data at src into a malloc'd buffer
+ * stored in *dst; the data is expected to inflate to exactly
+ * dst_size bytes.
+ * Returns 0 on success, in which case the caller frees *dst.
+ * Returns 1 on failure, after which *dst must not be used.
+ */
+int strinflate(char **dst, int dst_size,
+const char * restrict src, int src_size);