aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author yzrh <yzrh@noema.org> 2023-01-04 13:51:13 +0000
committer yzrh <yzrh@noema.org> 2023-01-04 13:51:13 +0000
commit 8276423eb8395eae3e3002442307272eff1c9e8f (patch)
tree 9aff1e7d1617e4af59beb549952dbc7e22f2b65d
parent 7ac0971a1711233bc0eaa5e8191590612959867b (diff)
download melon-8276423eb8395eae3e3002442307272eff1c9e8f.tar.gz
melon-8276423eb8395eae3e3002442307272eff1c9e8f.tar.zst
Prioritise incomplete object during deduplication.
Signed-off-by: yzrh <yzrh@noema.org>
-rw-r--r-- CHANGE.md 3
-rw-r--r-- src/cnki_pdf.c 30
-rw-r--r-- src/pdf_parser.c 25
3 files changed, 45 insertions, 13 deletions
diff --git a/CHANGE.md b/CHANGE.md
index 063d93a..3e05e10 100644
--- a/CHANGE.md
+++ b/CHANGE.md
@@ -2,6 +2,8 @@
==================
* Support HN text overlay.
+* Handle invalid PDF object token in CAJ and KDH.
+* Handle inaccurate page count in CAJ and KDH.
0.2.5 (2023-01-XX)
==================
@@ -9,6 +11,7 @@
* Improve PDF parser.
* Handle duplicated object in CAJ.
* Handle duplicated image in HN.
+* Handle incomplete PDF object in CAJ and KDH.
* Fix JBIG decoder.
0.2.4 (2022-12-31)
diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c
index af38aa6..76931ea 100644
--- a/src/cnki_pdf.c
+++ b/src/cnki_pdf.c
@@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
printf("Deleting duplicated object\n");
ptr = *pdf;
- while (ptr != NULL && ptr->next != NULL) {
- if (ptr->id == ptr->next->id) {
- pdf_get_obj(&ptr, ptr->id, &tmp);
- pdf_obj_del(&ptr, ptr->id);
+ while (ptr->next != NULL && ptr->next->next != NULL) {
+ if (ptr->next->id == ptr->next->next->id) {
+ /* Keep the bigger one, the smaller one is usually incomplete */
+ if (ptr->next->size < ptr->next->next->size) {
+ pdf_get_obj(&ptr, ptr->next->id, &tmp);
+ pdf_obj_del(&ptr, ptr->next->id);
+ } else {
+ pdf_get_obj(&ptr->next, ptr->next->id, &tmp);
+ pdf_obj_del(&ptr->next, ptr->next->id);
+ }
tmp->next = NULL;
pdf_obj_destroy(&tmp);
@@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
ret++;
if ((*param)->stat > 1)
- printf("Deleted duplicated object %d.\n", ptr->id);
+ printf("Deleted duplicated object %d.\n", ptr->next->id);
+
+ continue;
}
ptr = ptr->next;
@@ -236,6 +244,10 @@ cnki_pdf(cnki_t **param)
printf("Loaded %d object(s)\n",
pdf_get_count(&pdf));
+ pdf_obj_sort(&pdf);
+
+ _pdf_obj_dedup(param, &pdf);
+
int dictionary_size;
char *dictionary;
@@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param)
snprintf(buf, 64,
"]\n/Count %d\n>>",
- pdf_get_kid_count(&pdf, parent[i]));
+ pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]);
strcat(dictionary, buf);
pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0);
@@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param)
} else {
for (int i = 0; i < parent[0]; i++)
if (parent_missing[i] == 1)
- root = i;
+ root = parent[i + 1];
}
if (root == 0)
@@ -471,8 +483,6 @@ cnki_pdf(cnki_t **param)
_pdf_obj_sort(param, &pdf);
- _pdf_obj_dedup(param, &pdf);
-
_pdf_dump(param, &pdf);
pdf_obj_destroy(&pdf);
@@ -510,7 +520,7 @@ cnki_pdf_hn(cnki_t **param)
if (root_kid == NULL)
return 1;
- memset(root_kid, 0, (*param)->file_stat->page);
+ memset(root_kid, 0, (*param)->file_stat->page * sizeof(int));
object_hn_t *ptr = (*param)->object_hn;
while (ptr != NULL) {
diff --git a/src/pdf_parser.c b/src/pdf_parser.c
index 781bafa..ed7bfba 100644
--- a/src/pdf_parser.c
+++ b/src/pdf_parser.c
@@ -183,7 +183,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
ptr->size - (tail - buf) - 3,
">>", 2)) != NULL &&
memmem(tail + 3,
- ptr->size - (tail - buf) - 3,
+ (tmp - tail) - 3,
"stream\r\n", 8) == NULL)
tail = tmp;
@@ -226,8 +226,27 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
free(buf);
} else {
- ptr->object_size = ptr->size;
- ptr->object = buf;
+ /* Handle incomplete object */
+ head = buf;
+ while ((tmp = _memmem_whitespace(head,
+ ptr->size - (head - buf),
+ " 0 obj", 6)) != NULL)
+ head = tmp + 7;
+
+ if (head - buf > 0) {
+ ptr->object_size = ptr->size - (head - buf);
+ ptr->object = malloc(ptr->object_size);
+
+ if (ptr->object == NULL)
+ return 1;
+
+ memcpy(ptr->object, head, ptr->object_size);
+
+ free(buf);
+ } else {
+ ptr->object_size = ptr->size;
+ ptr->object = buf;
+ }
}
ptr = ptr->next;