Document similarity algorithms and enable alternate
authorColin Clark <colin.clark@cclark.uk>
Wed, 18 Oct 2023 11:36:56 +0000 (12:36 +0100)
committerColin Clark <colin.clark@cclark.uk>
Wed, 18 Oct 2023 11:36:56 +0000 (12:36 +0100)
The alternate algorithm can be enabled on Preferences/Advanced.

doc/docbook/GuideOptionsAdvanced.xml
doc/docbook/GuideReference.xml
doc/docbook/GuideReferenceSimilarityAlgorithms.xml [new file with mode: 0644]
src/main.cc
src/options.h
src/preferences.cc
src/rcfile.cc
src/similar.cc

index aacbb75..0c05242 100644 (file)
@@ -92,4 +92,8 @@
     <title>Thread Pools</title>
     <para>This option will limit the number of threads (cores) that are used when performing a duplicate image search. A value of <code>0</code> means use all available threads. This will give the fastest processing time, but will slow other processes including user input response time.</para> 
 </section>
+<section id="AlternateAlgorithm">
+    <title>Alternate Algorithm</title>
+    <para><link linkend="GuideReferenceSimilarityAlgorithms">Alternate Similarity Algorithm</link></para>
+</section>
 </section>
index fb14986..f25a950 100644 (file)
@@ -12,6 +12,7 @@
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferenceFileDates.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferenceTags.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferenceSupportedFormats.xml" />
+  <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferenceSimilarityAlgorithms.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferencePixbufLoaders.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferenceStandardPlugins.xml" />
   <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="GuideReferenceUTC.xml" />
diff --git a/doc/docbook/GuideReferenceSimilarityAlgorithms.xml b/doc/docbook/GuideReferenceSimilarityAlgorithms.xml
new file mode 100644 (file)
index 0000000..efdcdb4
--- /dev/null
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="utf-8"?>
+<section id="GuideReferenceSimilarityAlgorithms">
+  <title id="titleGuideReferenceSimilarityAlgorithms">Similarity Algorithms</title>
+  <para>
+    This function is intended to find images with similar color content. For example when an image was saved at different compression levels or dimensions (scaled down/up) the contents are similar, but these files do not match by file size, dimensions, or checksum.
+    <para />
+    A 32 x 32 array is created for each image. Imagine the image cut into 1024 rectangles, 32 across and 32 down.
+    <para />
+    For each array element, the average value of all the red and the green and the blue pixels is computed and stored in the array. Therefore the array represents the average color of each corresponding part of the image.
+    <para />
+    This data is stored in a file with the same name as the image and with the extension .sim. It is stored in the same location as thumbnails. If many images are to be compared, run-time is reduced by having these .sim files already created. This can be done via Edit/Cache Maintenance or by the command line instruction:
+    <code>geeqie --cache-maintenance &lt;path&gt;</code>
+  </para>
+  <section id="standard">
+    <title>Standard Algorithm</title>
+    <para>
+      To compare two images, each array element of each image is compared in turn. The computed value is the percent match of all elements of the two images. For this, simple comparisons are used - basically the value is an average of the corresponding array differences.
+      <para />
+      The value computed is in the range 0% to 100%.
+      <literallayout>
+        100% for exact matches (an image is compared to itself)
+        0% for exact opposite images (compare an all black to an all white image)
+      </literallayout>
+      Generally only a match of >85% is significant at all, and >95% is useful to find images that have been re-saved to other formats, dimensions, or compression.
+    </para>
+    <para>If the Ignore Orientation checkbox on the Duplicates window is selected, images are also checked for 90°, 180° and 270° rotations, as well as mirror and flip. This will increase run-time.</para>
+  </section>
+  <section id="alternate">
+    <title>Alternate Algorithm</title>
+    <para>
+      The alternate algorithm can be enabled on the Advanced tab of Preferences.
+      <para />
+      It does not check for rotations, mirror or flip.
+      <para />
+      After comparing two array elements of two images, the difference from the preceding element comparison is included in the computation.
+      <para />
+      There is an additional option to reduce the fingerprint to grayscale before comparisons are made.
+    </para>
+  </section>
+</section>
index de12173..7e0857e 100644 (file)
@@ -503,12 +503,6 @@ static void parse_command_line(gint argc, gchar *argv[])
                                printf_term(FALSE, "%s %s GTK%d\n", GQ_APPNAME, VERSION, gtk_major_version);
                                exit(0);
                                }
-                       else if (strcmp(cmd_line, "--alternate") == 0)
-                               {
-                               /* enable faster experimental algorithm */
-                               log_printf("Alternate similarity algorithm enabled\n");
-                               image_sim_alternate_set(TRUE);
-                               }
                        else if (strcmp(cmd_line, "-h") == 0 ||
                                 strcmp(cmd_line, "--help") == 0)
                                {
@@ -523,7 +517,7 @@ static void parse_command_line(gint argc, gchar *argv[])
                                print_term(FALSE, _("  -h, --help                       show this message\n"));
                                print_term(FALSE, _("  -l, --list [files] [collections] open collection window for command line\n"));
                                print_term(FALSE, _("  -n, --new-instance               open a new instance of Geeqie\n"));
-                               print_term(FALSE, _("  -o:, --log-file:<file>     save log data to file\n"));
+                               print_term(FALSE, _("  -o:, --log-file:<file>           save log data to file\n"));
                                print_term(FALSE, _("  -r, --remote                     send following commands to open window\n"));
                                print_term(FALSE, _("  -rh, --remote-help               print remote command list\n"));
                                print_term(FALSE, _("  -s, --slideshow                  start in slideshow mode\n"));
@@ -533,14 +527,9 @@ static void parse_command_line(gint argc, gchar *argv[])
                                print_term(FALSE, _("  +w, --show-log-window            show log window\n"));
 #ifdef DEBUG
                                print_term(FALSE, _("      --debug[=level]              turn on debug output\n"));
-                               print_term(FALSE, _("  -g:, --grep:<regexp>     filter debug output\n"));
+                               print_term(FALSE, _("  -g:, --grep:<regexp>             filter debug output\n"));
 #endif
 
-#if 0
-                               /* these options are not officially supported!
-                                * only for testing new features, no need to translate them */
-                               print_term(FALSE, "  --alternate                use alternate similarity algorithm\n");
-#endif
                                print_term(FALSE, "\n");
 
                                remote_help();
index 4d1a940..b38feac 100644 (file)
@@ -392,6 +392,12 @@ struct ConfOptions
                gboolean status_bar;
        } selectable_bars;
 
+       /* Alternate similarity algorithm */
+       struct {
+               gboolean enabled;
+               gboolean grayscale; /**< convert fingerprint to greyscale */
+       } alternate_similarity_algorithm;
+
        gchar *mouse_button_8; /**< user-definable mouse buttons */
        gchar *mouse_button_9; /**< user-definable mouse buttons */
 
index 808849d..6b000da 100644 (file)
@@ -465,6 +465,9 @@ static void config_window_apply()
 
        options->threads.duplicates = c_options->threads.duplicates > 0 ? c_options->threads.duplicates : -1;
 
+       options->alternate_similarity_algorithm.enabled = c_options->alternate_similarity_algorithm.enabled;
+       options->alternate_similarity_algorithm.grayscale = c_options->alternate_similarity_algorithm.grayscale;
+
 #ifdef DEBUG
        set_debug_level(debug_c);
 #endif
@@ -3838,18 +3841,20 @@ static gint extension_sort_cb(gconstpointer a, gconstpointer b)
 
 static void config_tab_advanced(GtkWidget *notebook)
 {
-       GtkWidget *vbox;
-       GtkWidget *group;
-       GSList *formats_list;
-       GList *extensions_list = nullptr;
        gchar **extensions;
-       GtkWidget *tabcomp;
        GdkPixbufFormat *fm;
        gint i;
+       GList *extensions_list = nullptr;
+       GSList *formats_list;
        GString *types_string = g_string_new(nullptr);
-       GtkWidget *types_string_label;
-       GtkWidget *threads_string_label;
+       GtkWidget *alternate_checkbox;
        GtkWidget *dupes_threads_spin;
+       GtkWidget *group;
+       GtkWidget *subgroup;
+       GtkWidget *tabcomp;
+       GtkWidget *threads_string_label;
+       GtkWidget *types_string_label;
+       GtkWidget *vbox;
 
        vbox = scrolled_notebook_page(notebook, _("Advanced"));
        group = pref_group_new(vbox, FALSE, _("External preview extraction"), GTK_ORIENTATION_VERTICAL);
@@ -3929,6 +3934,20 @@ static void config_tab_advanced(GtkWidget *notebook)
 
        dupes_threads_spin = pref_spin_new_int(vbox, _("Duplicate check:"), _("max. threads"), 0, get_cpu_cores(), 1, options->threads.duplicates, &c_options->threads.duplicates);
        gtk_widget_set_tooltip_markup(dupes_threads_spin, _("Set to 0 for unlimited"));
+
+       pref_spacer(group, PREF_PAD_GROUP);
+
+       pref_line(vbox, PREF_PAD_SPACE);
+
+       group = pref_group_new(vbox, FALSE, _("Alternate similarity alogorithm"), GTK_ORIENTATION_VERTICAL);
+
+       alternate_checkbox = pref_checkbox_new_int(group, _("Enable alternate similarity algorithm"), options->alternate_similarity_algorithm.enabled, &c_options->alternate_similarity_algorithm.enabled);
+
+       subgroup = pref_box_new(group, FALSE, GTK_ORIENTATION_VERTICAL, PREF_PAD_GAP);
+       pref_checkbox_link_sensitivity(alternate_checkbox, subgroup);
+
+       alternate_checkbox = pref_checkbox_new_int(subgroup, _("Use grayscale"), options->alternate_similarity_algorithm.grayscale, &c_options->alternate_similarity_algorithm.grayscale);
+       gtk_widget_set_tooltip_text(alternate_checkbox, _("Reduce fingerprint to grayscale"));
 }
 
 /* stereo tab */
index f1cf6d1..2073a0e 100644 (file)
@@ -559,6 +559,11 @@ static void write_global_attributes(GString *outstr, gint indent)
        /* GPU - see main.cc */
        WRITE_NL(); WRITE_BOOL(*options, override_disable_gpu);
        WRITE_SEPARATOR();
+
+       /* Alternate similarity algorithm */
+       WRITE_NL(); WRITE_BOOL(*options, alternate_similarity_algorithm.enabled);
+       WRITE_NL(); WRITE_BOOL(*options, alternate_similarity_algorithm.grayscale);
+       WRITE_SEPARATOR();
 }
 
 static void write_color_profile(GString *outstr, gint indent)
@@ -1050,6 +1055,10 @@ static gboolean load_global_params(const gchar **attribute_names, const gchar **
                /* GPU - see main.cc */
                if (READ_BOOL(*options, override_disable_gpu)) continue;
 
+               /* Alternative similarity algorithm */
+               if (READ_BOOL(*options, alternate_similarity_algorithm.enabled)) continue;
+               if (READ_BOOL(*options, alternate_similarity_algorithm.grayscale)) continue;
+
                /* Dummy options */
                if (READ_DUMMY(*options, image.dither_quality, "deprecated since 2012-08-13")) continue;
 
index f6cb826..bd090ca 100644 (file)
  * find images that have been re-saved to other formats, dimensions, or compression.
  */
 
-/*
- * The experimental (alternate) algorithm is only for testing of new techniques to
- * improve the result, and hopes to reduce false positives.
- */
-
-static gboolean alternate_enabled = FALSE;
-
-void image_sim_alternate_set(gboolean enable)
-{
-       alternate_enabled = enable;
-}
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-function"
-gboolean image_sim_alternate_enabled_unused(void)
-{
-       return alternate_enabled;
-}
-#pragma GCC diagnostic pop
-
 ImageSimilarityData *image_sim_new()
 {
        auto sd = g_new0(ImageSimilarityData, 1);
@@ -146,22 +126,17 @@ static void image_sim_channel_norm(guint8 *pix, gint len)
 }
 
 /*
- * define these to enable various components of the experimental compare functions
- *
- * Convert the thumbprint to greyscale (ignore all color information when comparing)
- *  #define ALTERNATE_USES_GREYSCALE 1
- *
- * Take into account the difference in change from one pixel to the next
- *  #define ALTERNATE_INCLUDE_COMPARE_CHANGE 1
+ * The Alternate algorithm is only for testing of new techniques to
+ * improve the result, and hopes to reduce false positives.
  */
-
 void image_sim_alternate_processing(ImageSimilarityData *sd)
 {
-#ifdef ALTERNATE_USES_GREYSCALE
        gint i;
-#endif
 
-       if (!alternate_enabled) return;
+       if (!options->alternate_similarity_algorithm.enabled)
+               {
+               return;
+               }
 
        image_sim_channel_norm(sd->avg_r, sizeof(sd->avg_r));
        image_sim_channel_norm(sd->avg_g, sizeof(sd->avg_g));
@@ -171,15 +146,16 @@ void image_sim_alternate_processing(ImageSimilarityData *sd)
        image_sim_channel_equal(sd->avg_g, sizeof(sd->avg_g));
        image_sim_channel_equal(sd->avg_b, sizeof(sd->avg_b));
 
-#ifdef ALTERNATE_USES_GREYSCALE
-       for (i = 0; i < sizeof(sd->avg_r); i++)
+       if (options->alternate_similarity_algorithm.grayscale)
                {
-               guint8 n;
+               for (i = 0; i < (gint)sizeof(sd->avg_r); i++)
+                       {
+                       guint8 n;
 
-               n = (guint8)((gint)(sd->avg_r[i] + sd->avg_g[i] + sd->avg_b[i]) / 3);
-               sd->avg_r[i] = sd->avg_g[i] = sd->avg_b[i] = n;
+                       n = (guint8)((gint)(sd->avg_r[i] + sd->avg_g[i] + sd->avg_b[i]) / 3);
+                       sd->avg_r[i] = sd->avg_g[i] = sd->avg_b[i] = n;
+                       }
                }
-#endif
 }
 
 gint mround(gdouble x)
@@ -296,7 +272,6 @@ ImageSimilarityData *image_sim_new_from_pixbuf(GdkPixbuf *pixbuf)
        return sd;
 }
 
-#ifdef ALTERNATE_INCLUDE_COMPARE_CHANGE
 static gdouble alternate_image_sim_compare_fast(ImageSimilarityData *a, ImageSimilarityData *b, gdouble min)
 {
        gint sim;
@@ -331,7 +306,6 @@ static gdouble alternate_image_sim_compare_fast(ImageSimilarityData *a, ImageSim
 
        return (1.0 - ((gdouble)sim / (255.0 * 1024.0 * 4.0)) );
 }
-#endif
 
 gdouble image_sim_compare_transfo(ImageSimilarityData *a, ImageSimilarityData *b, gchar transfo)
 {
@@ -387,9 +361,10 @@ gdouble image_sim_compare_fast_transfo(ImageSimilarityData *a, ImageSimilarityDa
        gint i1, i2, *i;
        gint j1, j2, *j;
 
-#ifdef ALTERNATE_INCLUDE_COMPARE_CHANGE
-       if (alternate_enabled) return alternate_image_sim_compare_fast(a, b, min);
-#endif
+       if (options->alternate_similarity_algorithm.enabled)
+               {
+               return alternate_image_sim_compare_fast(a, b, min);
+               }
 
        if (!a || !b || !a->filled || !b->filled) return 0.0;