From 5e1d32240bb74e6512849fcd3e912aa6f5343271 Mon Sep 17 00:00:00 2001 From: Colin Clark Date: Sat, 30 Jan 2021 14:39:01 +0000 Subject: [PATCH] Find duplicates speed-up for simple comparisons Speed increase for simple comparisons (i.e. all those except similarity checks) This is achieved by using quicksorts and binary searches. --- src/dupe.c | 533 ++++++++++++++++++++++++++++++++++++++++++++++++++--- src/dupe.h | 3 +- 2 files changed, 514 insertions(+), 22 deletions(-) diff --git a/src/dupe.c b/src/dupe.c index 14f8dce7..09dd34b5 100644 --- a/src/dupe.c +++ b/src/dupe.c @@ -75,7 +75,13 @@ enum { DUPE_COLUMN_COUNT /* total columns */ }; +typedef enum { + DUPE_MATCH = 0, + DUPE_NO_MATCH, + DUPE_NAME_MATCH +} DUPE_CHECK_RESULT; +static DupeMatchType param_match_mask; static GList *dupe_window_list = NULL; /* list of open DupeWindow *s */ /* @@ -420,6 +426,7 @@ static void dupe_item_read_cache(DupeItem *di) { di->width = cd->width; di->height = cd->height; + di->dimensions = (di->width << 16) + di->height; } if (!di->md5sum && cd->have_md5sum) { @@ -1299,6 +1306,418 @@ static gboolean dupe_match(DupeItem *a, DupeItem *b, DupeMatchType mask, gdouble return TRUE; } +/** + * @brief Determine if there is a match + * @param di1 + * @param di2 + * @param data + * @returns DUPE_MATCH/DUPE_NO_MATCH/DUPE_NAME_MATCH + * DUPE_NAME_MATCH is used for name != contents searches: + * the name and content match i.e. + * no match, but keep searching + * + * Called when stepping down the array looking for adjacent matches, + * and from the 2nd set search. + * + * Is not used for similarity checks. + */ +static DUPE_CHECK_RESULT dupe_match_check(DupeItem *di1, DupeItem *di2, gpointer data) +{ + DupeWindow *dw = data; + DupeMatchType mask = dw->match_mask; + + if (mask & DUPE_MATCH_ALL) + { + return DUPE_MATCH; + } + if (mask & DUPE_MATCH_PATH) + { + if (utf8_compare(di1->fd->path, di2->fd->path, TRUE) != 0) + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_NAME) + { + if (g_strcmp0(di1->fd->collate_key_name, di2->fd->collate_key_name) != 0) + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_NAME_CI) + { + if (g_strcmp0(di1->fd->collate_key_name_nocase, di2->fd->collate_key_name_nocase) != 0 ) + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_NAME_CONTENT) + { + if (g_strcmp0(di1->fd->collate_key_name, di2->fd->collate_key_name) == 0) + { + if (g_strcmp0(di1->md5sum, di2->md5sum) == 0) + { + return DUPE_NAME_MATCH; + } + } + else + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_NAME_CI_CONTENT) + { + if (strcmp(di1->fd->collate_key_name_nocase, di2->fd->collate_key_name_nocase) == 0) + { + if (g_strcmp0(di1->md5sum, di2->md5sum) == 0) + { + return DUPE_NAME_MATCH; + } + } + else + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_SIZE) + { + if (di1->fd->size != di2->fd->size) + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_DATE) + { + if (di1->fd->date != di2->fd->date) + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_SUM) + { + if (g_strcmp0(di1->md5sum, di2->md5sum) != 0) + { + return DUPE_NO_MATCH; + } + } + if (mask & DUPE_MATCH_DIM) + { + if (di1->dimensions != di2->dimensions) + { + return DUPE_NO_MATCH; + } + } + + return DUPE_MATCH; +} + +/** + * @brief The callback for the binary search + * @param a + * @param b + * @param param_match_mask + * @returns negative/0/positive + * + * Is not used for similarity checks. + * + * Used only when two file sets are used. + * Requires use of a global for param_match_mask because there is no + * g_array_binary_search_with_data() function in glib. + */ +static gint dupe_match_binary_search_cb(gconstpointer a, gconstpointer b) +{ + const DupeItem *di1 = *((DupeItem **) a); + const DupeItem *di2 = b; + DupeMatchType mask = param_match_mask; + + if (mask & DUPE_MATCH_ALL) + { + return 0; + } + if (mask & DUPE_MATCH_PATH) + { + return utf8_compare(di1->fd->path, di2->fd->path, TRUE); + } + if (mask & DUPE_MATCH_NAME) + { + return g_strcmp0(di1->fd->collate_key_name, di2->fd->collate_key_name); + } + if (mask & DUPE_MATCH_NAME_CI) + { + return strcmp(di1->fd->collate_key_name_nocase, di2->fd->collate_key_name_nocase); + } + if (mask & DUPE_MATCH_NAME_CONTENT) + { + return g_strcmp0(di1->fd->collate_key_name, di2->fd->collate_key_name); + } + if (mask & DUPE_MATCH_NAME_CI_CONTENT) + { + return strcmp(di1->fd->collate_key_name_nocase, di2->fd->collate_key_name_nocase); + } + if (mask & DUPE_MATCH_SIZE) + { + return (di1->fd->size - di2->fd->size); + } + if (mask & DUPE_MATCH_DATE) + { + return (di1->fd->date - di2->fd->date); + } + if (mask & DUPE_MATCH_SUM) + { + return g_strcmp0(di1->md5sum, di2->md5sum); + } + if (mask & DUPE_MATCH_DIM) + { + return (di1->dimensions - di2->dimensions); + } + + return 0; +} + +/** + * @brief The callback for the array sort + * @param a + * @param b + * @param data + * @returns negative/0/positive + * + * Is not used for similarity checks. +*/ +static gint dupe_match_sort_cb(gconstpointer a, gconstpointer b, gpointer data) +{ + const DupeItem *di1 = *((DupeItem **) a); + const DupeItem *di2 = *((DupeItem **) b); + DupeWindow *dw = data; + DupeMatchType mask = dw->match_mask; + + if (mask & DUPE_MATCH_ALL) + { + return 0; + } + if (mask & DUPE_MATCH_PATH) + { + return utf8_compare(di1->fd->path, di2->fd->path, TRUE); + } + if (mask & DUPE_MATCH_NAME) + { + return g_strcmp0(di1->fd->collate_key_name, di2->fd->collate_key_name); + } + if (mask & DUPE_MATCH_NAME_CI) + { + return strcmp(di1->fd->collate_key_name_nocase, di2->fd->collate_key_name_nocase); + } + if (mask & DUPE_MATCH_NAME_CONTENT) + { + return g_strcmp0(di1->fd->collate_key_name, di2->fd->collate_key_name); + } + if (mask & DUPE_MATCH_NAME_CI_CONTENT) + { + return strcmp(di1->fd->collate_key_name_nocase, di2->fd->collate_key_name_nocase); + } + if (mask & DUPE_MATCH_SIZE) + { + return (di1->fd->size - di2->fd->size); + } + if (mask & DUPE_MATCH_DATE) + { + return (di1->fd->date - di2->fd->date); + } + if (mask & DUPE_MATCH_SUM) + { + if (di1->md5sum[0] == '\0' || di2->md5sum[0] == '\0') + { + return -1; + } + else + { + return strcmp(di1->md5sum, di2->md5sum); + } + } + if (mask & DUPE_MATCH_DIM) + { + if (!di1 || !di2 || !di1->width || !di1->height || !di2->width || !di2->height) + { + return -1; + } + return (di1->dimensions - di2->dimensions); + } + + return 0; // should not execute +} + +/** + * @brief Check for duplicate matches + * @param dw + * + * Is not used for similarity checks. + * + * Loads the file sets into an array and sorts on the searched + * for parameter. + * + * If one file set, steps down the array looking for adjacent equal values. + * + * If two file sets, steps down the first set and for each value + * does a binary search for matches in the second set. + */ +static void dupe_array_check(DupeWindow *dw ) +{ + GArray *array_set1; + GArray *array_set2; + GList *work; + gint i_set1; + gint i_set2; + DUPE_CHECK_RESULT check_result; + DupeMatchType mask = dw->match_mask; + param_match_mask = dw->match_mask; + guint out_match_index; + + if (!dw->list) return; + + array_set1 = g_array_new(TRUE, TRUE, sizeof(gpointer)); + array_set2 = g_array_new(TRUE, TRUE, sizeof(gpointer)); + dupe_match_reset_list(dw->list); + + work = dw->list; + while (work) + { + DupeItem *di = work->data; + g_array_append_val(array_set1, di); + work = work->next; + } + + g_array_sort_with_data(array_set1, dupe_match_sort_cb, dw); + + if (dw->second_set) + { + /* Two sets - nothing can be done until a second set is loaded */ + if (dw->second_list) + { + work = dw->second_list; + while (work) + { + DupeItem *di = work->data; + g_array_append_val(array_set2, (work->data)); + work = work->next; + } + g_array_sort_with_data(array_set2, dupe_match_sort_cb, dw); + + for (i_set1 = 0; i_set1 <= (gint)(array_set1->len) - 1; i_set1++) + { + DupeItem *di1 = g_array_index(array_set1, gpointer, i_set1); + DupeItem *di2 = NULL; + /* If multiple identical entries in set 1, use the last one */ + if (i_set1 < (gint)(array_set1->len) - 2) + { + di2 = g_array_index(array_set1, gpointer, i_set1 + 1); + check_result = dupe_match_check(di1, di2, dw); + if (check_result == DUPE_MATCH || check_result == DUPE_NAME_MATCH) + { + continue; + } + } + if (g_array_binary_search(array_set2, di1, dupe_match_binary_search_cb, &out_match_index)) + { + di2 = g_array_index(array_set2, gpointer, out_match_index); + + check_result = dupe_match_check(di1, di2, dw); + if (check_result == DUPE_MATCH || check_result == DUPE_NAME_MATCH) + { + if (check_result == DUPE_MATCH) + { + dupe_match_link(di2, di1, 0.0); + } + i_set2 = out_match_index + 1; + + if (i_set2 > (gint)(array_set2->len) - 1) + { + break; + } + /* Look for multiple matches in set 2 for item di1 */ + di2 = g_array_index(array_set2, gpointer, i_set2); + check_result = dupe_match_check(di1, di2, dw); + while (check_result == DUPE_MATCH || check_result == DUPE_NAME_MATCH) + { + if (check_result == DUPE_MATCH) + { + dupe_match_link(di2, di1, 0.0); + } + i_set2++; + if (i_set2 > (gint)(array_set2->len) - 1) + { + break; + } + di2 = g_array_index(array_set2, gpointer, i_set2); + check_result = dupe_match_check(di1, di2, dw); + } + } + } + } + } + } + else + { + /* File set 1 only */ + g_list_free(dw->dupes); + dw->dupes = NULL; + + if ((gint)(array_set1->len) > 1) + { + for (i_set1 = 0; i_set1 <= (gint)(array_set1->len) - 2; i_set1++) + { + DupeItem *di1 = g_array_index(array_set1, gpointer, i_set1); + DupeItem *di2 = g_array_index(array_set1, gpointer, i_set1 + 1); + + check_result = dupe_match_check(di1, di2, dw); + if (check_result == DUPE_MATCH || check_result == DUPE_NAME_MATCH) + { + if (check_result == DUPE_MATCH) + { + dupe_match_link(di2, di1, 0.0); + } + i_set1++; + + if ( i_set1 + 1 > (gint)(array_set1->len) - 1) + { + break; + } + /* Look for multiple matches for item di1 */ + di2 = g_array_index(array_set1, gpointer, i_set1 + 1); + check_result = dupe_match_check(di1, di2, dw); + while (check_result == DUPE_MATCH || check_result == DUPE_NAME_MATCH) + { + if (check_result == DUPE_MATCH) + { + dupe_match_link(di2, di1, 0.0); + } + i_set1++; + + if (i_set1 + 1 > (gint)(array_set1->len) - 1) + { + break; + } + di2 = g_array_index(array_set1, gpointer, i_set1 + 1); + check_result = dupe_match_check(di1, di2, dw); + } + } + } + } + } + g_array_free(array_set1, TRUE); + g_array_free(array_set2, TRUE); +} + +/** + * @brief Look for similarity match + * @param dw + * @param needle + * @param start + * + * Called from dupe_check_cb. + * Called for each entry in the list. + * Steps through the list looking for matches against needle. + * + * Only used for similarity checks. + */ static void dupe_list_check_match(DupeWindow *dw, DupeItem *needle, GList *start) { GList *work; @@ -1556,18 +1975,22 @@ static GList *dupe_setup_point_step(DupeWindow *dw, GList *p) return NULL; } -static gboolean dupe_check_cb(gpointer data) +/** + * @brief Generates the sumcheck or dimensions + * @param list Set1 or set2 + * @returns TRUE/FALSE = not completed/completed + * + * Ensures that the DIs contain the MD5SUM or dimensions for all items in + * the list. One item at a time. Re-enters if not completed. + */ +static gboolean create_checksums_dimensions(DupeWindow *dw, GList *list) { - DupeWindow *dw = data; - - if (!dw->idle_id) return FALSE; - - if (!dw->setup_done) - { - if ((dw->match_mask & DUPE_MATCH_SUM) && - !(dw->setup_mask & DUPE_MATCH_SUM) ) + if ((dw->match_mask & DUPE_MATCH_SUM) || + (dw->match_mask & DUPE_MATCH_NAME_CONTENT) || + (dw->match_mask & DUPE_MATCH_NAME_CI_CONTENT)) { - if (!dw->setup_point) dw->setup_point = dw->list; + /* MD5SUM only */ + if (!dw->setup_point) dw->setup_point = list; // setup_point clear on 1st entry while (dw->setup_point) { @@ -1584,7 +2007,10 @@ static gboolean dupe_check_cb(gpointer data) if (options->thumbnails.enable_caching) { dupe_item_read_cache(di); - if (di->md5sum) return TRUE; + if (di->md5sum) + { + return TRUE; + } } di->md5sum = md5_text_from_file_utf8(di->fd->path, ""); @@ -1595,13 +2021,13 @@ static gboolean dupe_check_cb(gpointer data) return TRUE; } } - dw->setup_mask |= DUPE_MATCH_SUM; dupe_setup_reset(dw); } - if ((dw->match_mask & DUPE_MATCH_DIM) && - !(dw->setup_mask & DUPE_MATCH_DIM) ) + + if ((dw->match_mask & DUPE_MATCH_DIM) ) { - if (!dw->setup_point) dw->setup_point = dw->list; + /* Dimensions only */ + if (!dw->setup_point) dw->setup_point = list; while (dw->setup_point) { @@ -1617,10 +2043,14 @@ static gboolean dupe_check_cb(gpointer data) if (options->thumbnails.enable_caching) { dupe_item_read_cache(di); - if (di->width != 0 || di->height != 0) return TRUE; + if (di->width != 0 || di->height != 0) + { + return TRUE; + } } image_load_dimensions(di->fd, &di->width, &di->height); + di->dimensions = (di->width << 16) + di->height; if (options->thumbnails.enable_caching) { dupe_item_write_cache(di); @@ -1628,15 +2058,54 @@ static gboolean dupe_check_cb(gpointer data) return TRUE; } } - dw->setup_mask |= DUPE_MATCH_DIM; dupe_setup_reset(dw); } + + return FALSE; +} + +/** + * @brief Check set 1 (and set 2) for matches + * @param data DupeWindow + * @returns TRUE/FALSE = not completed/completed + * + * Initiated from start, loader done and item remove + * + * On first entry generates di->MD5SUM, di->dimensions and sim data, + * and updates the cache. + */ +static gboolean dupe_check_cb(gpointer data) +{ + DupeWindow *dw = data; + + if (!dw->idle_id) + { + return FALSE; + } + + if (!dw->setup_done) /* Clear on 1st entry */ + { + if (dw->list) + { + if (create_checksums_dimensions(dw, dw->list)) + { + return TRUE; + } + } + if (dw->second_list) + { + if (create_checksums_dimensions(dw, dw->second_list)) + { + return TRUE; + } + } if ((dw->match_mask & DUPE_MATCH_SIM_HIGH || dw->match_mask & DUPE_MATCH_SIM_MED || dw->match_mask & DUPE_MATCH_SIM_LOW || dw->match_mask & DUPE_MATCH_SIM_CUSTOM) && !(dw->setup_mask & DUPE_MATCH_SIM_MED) ) { + /* Similarity only */ if (!dw->setup_point) dw->setup_point = dw->list; while (dw->setup_point) @@ -1681,12 +2150,18 @@ static gboolean dupe_check_cb(gpointer data) dw->setup_mask |= DUPE_MATCH_SIM_MED; dupe_setup_reset(dw); } + + /* End of setup not done */ dupe_window_update_progress(dw, _("Comparing..."), 0.0, FALSE); dw->setup_done = TRUE; dupe_setup_reset(dw); dw->setup_count = g_list_length(dw->list); } + /* Setup done - dw->working set to NULL below + * Set before 1st entry: dw->working = g_list_last(dw->list) + * Set before 1st entry: dw->setup_count = g_list_length(dw->list) + */ if (!dw->working) { if (dw->setup_count > 0) @@ -1711,11 +2186,27 @@ static gboolean dupe_check_cb(gpointer data) return FALSE; } - dupe_list_check_match(dw, (DupeItem *)dw->working->data, dw->working); - dupe_window_update_progress(dw, _("Comparing..."), dw->setup_count == 0 ? 0.0 : (gdouble) dw->setup_n / dw->setup_count, FALSE); - dw->setup_n++; + if (dw->match_mask == DUPE_MATCH_SIM_HIGH || + dw->match_mask == DUPE_MATCH_SIM_MED || + dw->match_mask == DUPE_MATCH_SIM_LOW || + dw->match_mask == DUPE_MATCH_SIM_CUSTOM) + { + /* This is the similarity comparison */ + dupe_list_check_match(dw, (DupeItem *)dw->working->data, dw->working); + dupe_window_update_progress(dw, _("Comparing..."), dw->setup_count == 0 ? 0.0 : (gdouble) dw->setup_n / dw->setup_count, FALSE); + dw->setup_n++; - dw->working = dw->working->prev; + dw->working = dw->working->prev; /* Is NULL when complete */ + } + else + { + /* This is the comparison for all other parameters. + * dupe_array_check() processes the entire list in one go + */ + dw->working = NULL; + dupe_window_update_progress(dw, _("Comparing..."), 0.0, FALSE); + dupe_array_check(dw); + } return TRUE; } diff --git a/src/dupe.h b/src/dupe.h index d8970221..9744e1b4 100644 --- a/src/dupe.h +++ b/src/dupe.h @@ -62,6 +62,7 @@ struct _DupeItem gchar *md5sum; gint width; gint height; + gint dimensions; /* Computed as (di->width << 16) + di->height */ ImageSimilarityData *simd; @@ -106,7 +107,7 @@ struct _DupeWindow guint idle_id; /* event source id */ GList *working; gint setup_done; - gint setup_count; + gint setup_count; /* length of set1 or if 2 sets, total length of both */ gint setup_n; /* these are merely for speed optimization */ GList *setup_point; /* ditto */ DupeMatchType setup_mask; /* ditto */ -- 2.20.1