about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: yzrh <yzrh@noema.org> 2023-01-01 19:31:33 +0000
committer: yzrh <yzrh@noema.org> 2023-01-01 19:31:33 +0000
commit: 1a1fee1034b7d143a3ad77707ef930f2a8d1e3d8 (patch)
tree: de9d58c584ec37ca4243926c81e68573dabcaac9 /src
parent: cde014cffbe2e8d94de144008ad00bbccbb3a8ab (diff)
download: melon-1a1fee1034b7d143a3ad77707ef930f2a8d1e3d8.tar.gz
melon-1a1fee1034b7d143a3ad77707ef930f2a8d1e3d8.tar.zst
Handle duplicated object in CAJ.
Signed-off-by: yzrh <yzrh@noema.org>
Diffstat (limited to 'src')
-rw-r--r-- src/cnki_pdf.c 73
1 files changed, 47 insertions, 26 deletions
diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c
index 6cb8c9f..90ba28e 100644
--- a/src/cnki_pdf.c
+++ b/src/cnki_pdf.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2022, yzrh <yzrh@noema.org>
+ * Copyright (c) 2020-2023, yzrh <yzrh@noema.org>
*
* SPDX-License-Identifier: Apache-2.0
*/
@@ -145,11 +145,46 @@ _pdf_obj_sort(cnki_t **param, pdf_object_t **pdf)
ret = pdf_obj_sort(pdf);
+ if ((*param)->stat > 0)
+ printf("Sorted object(s)\n");
+
+ return ret;
+}
+
+static int
+_pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
+{
+ int ret = 0;
+
+ pdf_object_t *tmp;
+ pdf_object_t *ptr;
+
+ if ((*param)->stat > 1)
+ printf("Deleting duplicated object\n\t%8s\n", "id");
+
+ ptr = *pdf;
+ while (ptr->next != NULL) {
+ if (ptr->id == ptr->next->id) {
+ pdf_get_obj(&ptr, ptr->id, &tmp);
+ pdf_obj_del(&ptr, ptr->id);
+
+ tmp->next = NULL;
+ pdf_obj_destroy(&tmp);
+
+ ret++;
+
+ if ((*param)->stat > 1)
+ printf("\t%8d\n", ptr->id);
+ }
+
+ ptr = ptr->next;
+ }
+
if ((*param)->stat > 0) {
if (ret == 0)
- printf("Sorted object(s)\n");
+ printf("No duplicated object\n");
else
- printf("Object(s) not sorted\n");
+ printf("Deleted %d duplicated object(s)\n", ret);
}
return ret;
@@ -338,12 +373,9 @@ cnki_pdf(cnki_t **param)
printf("Generating root object\n");
snprintf(buf, 64,
- "<<\n/Type /Pages\n/Kids ");
+ "<<\n/Type /Pages\n/Kids [");
strcat(dictionary, buf);
- if (parent[0] > 1)
- strcat(dictionary, "[");
-
for (int i = 0, j = 0; i < parent[0]; i++) {
if (parent_missing[i] == 1) {
snprintf(buf, 64, "%d 0 R", parent[i + 1]);
@@ -354,12 +386,7 @@ cnki_pdf(cnki_t **param)
}
}
- if (parent[0] > 1)
- strcat(dictionary, "]");
-
- strcat(dictionary, "\n");
-
- snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page);
+ snprintf(buf, 64, "]\n/Count %d\n", (*param)->file_stat->page);
strcat(dictionary, buf);
strcat(dictionary, ">>");
@@ -442,6 +469,8 @@ cnki_pdf(cnki_t **param)
_pdf_obj_sort(param, &pdf);
+ _pdf_obj_dedup(param, &pdf);
+
_pdf_dump(param, &pdf);
pdf_obj_destroy(&pdf);
@@ -721,12 +750,12 @@ cnki_pdf_hn(cnki_t **param)
if ((*param)->stat > 2)
printf("Not extracted.\n");
- pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0);
+ pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0);
} else {
if ((*param)->stat > 2)
printf("Unsupported format.\n");
- pdf_obj_append(&pdf, ids[i], "null", NULL, NULL, 0);
+ pdf_obj_append(&pdf, ids[i], NULL, NULL, NULL, 0);
}
}
@@ -1028,7 +1057,7 @@ cnki_pdf_hn(cnki_t **param)
free(stream);
} else {
pdf_obj_append(&pdf, ids[ptr->image_length + 1],
- "null", NULL, NULL, 0);
+ NULL, NULL, NULL, 0);
}
memset(dictionary, 0, dictionary_size);
@@ -1094,12 +1123,9 @@ cnki_pdf_hn(cnki_t **param)
int root = pdf_get_free_id(&pdf);
- snprintf(buf, 64, "<<\n/Type /Pages\n/Kids ");
+ snprintf(buf, 64, "<<\n/Type /Pages\n/Kids [");
strcat(dictionary, buf);
- if ((*param)->file_stat->page > 1)
- strcat(dictionary, "[");
-
for (int i = 0; i < (*param)->file_stat->page; i++) {
snprintf(buf, 64, "%d 0 R", root_kid[i]);
strcat(dictionary, buf);
@@ -1107,12 +1133,7 @@ cnki_pdf_hn(cnki_t **param)
strcat(dictionary, " ");
}
- if ((*param)->file_stat->page > 1)
- strcat(dictionary, "]");
-
- strcat(dictionary, "\n");
-
- snprintf(buf, 64, "/Count %d\n", (*param)->file_stat->page);
+ snprintf(buf, 64, "]\n/Count %d\n", (*param)->file_stat->page);
strcat(dictionary, buf);
strcat(dictionary, ">>");