diff options
author | yzrh <yzrh@noema.org> | 2023-01-04 13:51:13 +0000 |
---|---|---|
committer | yzrh <yzrh@noema.org> | 2023-01-04 13:51:13 +0000 |
commit | 8276423eb8395eae3e3002442307272eff1c9e8f (patch) | |
tree | 9aff1e7d1617e4af59beb549952dbc7e22f2b65d /src | |
parent | 7ac0971a1711233bc0eaa5e8191590612959867b (diff) | |
download | melon-8276423eb8395eae3e3002442307272eff1c9e8f.tar.gz melon-8276423eb8395eae3e3002442307272eff1c9e8f.tar.zst |
Prioritise incomplete object during deduplication.
Signed-off-by: yzrh <yzrh@noema.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/cnki_pdf.c | 30 | ||||
-rw-r--r-- | src/pdf_parser.c | 25 |
2 files changed, 42 insertions, 13 deletions
```diff
diff --git a/src/cnki_pdf.c b/src/cnki_pdf.c
index af38aa6..76931ea 100644
--- a/src/cnki_pdf.c
+++ b/src/cnki_pdf.c
@@ -163,10 +163,16 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
 		printf("Deleting duplicated object\n");
 
 	ptr = *pdf;
-	while (ptr != NULL && ptr->next != NULL) {
-		if (ptr->id == ptr->next->id) {
-			pdf_get_obj(&ptr, ptr->id, &tmp);
-			pdf_obj_del(&ptr, ptr->id);
+	while (ptr->next != NULL && ptr->next->next != NULL) {
+		if (ptr->next->id == ptr->next->next->id) {
+			/* Keep the bigger one, the smaller one is usually incomplete */
+			if (ptr->next->size < ptr->next->next->size) {
+				pdf_get_obj(&ptr, ptr->next->id, &tmp);
+				pdf_obj_del(&ptr, ptr->next->id);
+			} else {
+				pdf_get_obj(&ptr->next, ptr->next->id, &tmp);
+				pdf_obj_del(&ptr->next, ptr->next->id);
+			}
 
 			tmp->next = NULL;
 			pdf_obj_destroy(&tmp);
@@ -174,7 +180,9 @@ _pdf_obj_dedup(cnki_t **param, pdf_object_t **pdf)
 			ret++;
 
 			if ((*param)->stat > 1)
-				printf("Deleted duplicated object %d.\n", ptr->id);
+				printf("Deleted duplicated object %d.\n", ptr->next->id);
+
+			continue;
 		}
 
 		ptr = ptr->next;
@@ -236,6 +244,10 @@ cnki_pdf(cnki_t **param)
 		printf("Loaded %d object(s)\n",
 			pdf_get_count(&pdf));
 
+	pdf_obj_sort(&pdf);
+
+	_pdf_obj_dedup(param, &pdf);
+
 	int dictionary_size;
 	char *dictionary;
 
@@ -301,7 +313,7 @@ cnki_pdf(cnki_t **param)
 
 			snprintf(buf, 64,
 				"]\n/Count %d\n>>",
-				pdf_get_kid_count(&pdf, parent[i]));
+				pdf_get_kid_count(&pdf, parent[i]) > 0 ? pdf_get_kid_count(&pdf, parent[i]) : kid[0]);
 			strcat(dictionary, buf);
 
 			pdf_obj_prepend(&pdf, parent[i], NULL, dictionary, NULL, 0);
@@ -354,7 +366,7 @@ cnki_pdf(cnki_t **param)
 	} else {
 		for (int i = 0; i < parent[0]; i++)
 			if (parent_missing[i] == 1)
-				root = i;
+				root = parent[i + 1];
 	}
 
 	if (root == 0)
@@ -471,8 +483,6 @@ cnki_pdf(cnki_t **param)
 
 	_pdf_obj_sort(param, &pdf);
 
-	_pdf_obj_dedup(param, &pdf);
-
 	_pdf_dump(param, &pdf);
 
 	pdf_obj_destroy(&pdf);
@@ -510,7 +520,7 @@ cnki_pdf_hn(cnki_t **param)
 	if (root_kid == NULL)
 		return 1;
 
-	memset(root_kid, 0, (*param)->file_stat->page);
+	memset(root_kid, 0, (*param)->file_stat->page * sizeof(int));
 
 	object_hn_t *ptr = (*param)->object_hn;
 	while (ptr != NULL) {
diff --git a/src/pdf_parser.c b/src/pdf_parser.c
index 781bafa..ed7bfba 100644
--- a/src/pdf_parser.c
+++ b/src/pdf_parser.c
@@ -183,7 +183,7 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
 						ptr->size - (tail - buf) - 3,
 						">>", 2)) != NULL &&
 						memmem(tail + 3,
-						ptr->size - (tail - buf) - 3,
+						(tmp - tail) - 3,
 						"stream\r\n", 8) == NULL)
 					tail = tmp;
 
@@ -226,8 +226,27 @@ pdf_load(pdf_object_t **pdf, FILE **fp, int size_buf)
 
 			free(buf);
 		} else {
-			ptr->object_size = ptr->size;
-			ptr->object = buf;
+			/* Handle incomplete object */
+			head = buf;
+			while ((tmp = _memmem_whitespace(head,
+					ptr->size - (head - buf),
+					" 0 obj", 6)) != NULL)
+				head = tmp + 7;
+
+			if (head - buf > 0) {
+				ptr->object_size = ptr->size - (head - buf);
+				ptr->object = malloc(ptr->object_size);
+
+				if (ptr->object == NULL)
+					return 1;
+
+				memcpy(ptr->object, head, ptr->object_size);
+
+				free(buf);
+			} else {
+				ptr->object_size = ptr->size;
+				ptr->object = buf;
+			}
 		}
 
 		ptr = ptr->next;
```