From 569a3f4548cc525aac8bd246da213eb48a17f0e1 Mon Sep 17 00:00:00 2001
From: sundb <sundbcn@gmail.com>
Date: Thu, 1 Apr 2021 13:20:15 +0800
Subject: [PATCH] Use chi-square for random distributivity verification in test
 (#8709)

Problem:
Currently, random distribution verification checks that the observed
frequency of each element falls within a fixed range. That range is only a
rough estimate, so these tests had rare sporadic failures, and the
probability of such a false failure could not be quantified.

Solution:
Using a chi-square test instead of the original frequency range check gives
the test a quantifiable failure probability and makes real distribution
problems easier to detect.
---
 tests/support/util.tcl   | 28 +++++++++++++++++++++-------
 tests/unit/type/hash.tcl |  6 ++++--
 tests/unit/type/set.tcl  |  6 ++++--
 tests/unit/type/zset.tcl |  6 ++++--
 4 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index c35441ab..5ea85c9e 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -682,20 +682,34 @@ proc string2printable s {
     return $res
 }
 
-# Check that probability of each element are between {min_prop} and {max_prop}.
-proc check_histogram_distribution {res min_prop max_prop} {
+# Compute Pearson's chi-square statistic of a random sample against a
+# uniform distribution over the distinct values that occur in the sample.
+# See https://en.wikipedia.org/wiki/Chi-square_distribution
+#
+# param res    List of sampled elements (must not be empty)
+# return       Chi-square statistic, sum of (observed - expected)^2 / expected
+#
+# Only values present in res are counted: a value that is never sampled
+# contributes nothing and lowers the number of categories.
+#
+# Compare the result against a chi-square critical value table using
+# df (degrees of freedom) = number of distinct values minus 1; the larger
+# the statistic, the less likely the sample is uniformly distributed.
+proc chi_square_value {res} {
     unset -nocomplain mydict
     foreach key $res {
         dict incr mydict $key 1
     }
 
+    set x2_value 0
+    # Expected count per value; double() avoids integer-division truncation
+    set p [expr {double([llength $res]) / [dict size $mydict]}]
     foreach key [dict keys $mydict] {
         set value [dict get $mydict $key]
-        set probability [expr {double($value) / [llength $res]}]
-        if {$probability < $min_prop || $probability > $max_prop} {
-            return false
-        }
+        # Aggregate the chi-square value of each element
+        set v [expr {pow($value - $p, 2) / $p}]
+        set x2_value [expr {$x2_value + $v}]
     }
 
-    return true
+    return $x2_value
 }
diff --git a/tests/unit/type/hash.tcl b/tests/unit/type/hash.tcl
index 2eea9889..fcf97eed 100644
--- a/tests/unit/type/hash.tcl
+++ b/tests/unit/type/hash.tcl
@@ -105,8 +105,9 @@ start_server {tags {"hash"}} {
             assert_equal [llength $res] 2002
 
             # Test random uniform distribution
+            # df = 9: chi-square >= 40 occurs by chance with probability ~0.00001
             set res [r hrandfield myhash -1000]
-            assert_equal [check_histogram_distribution $res 0.05 0.15] true
+            assert_lessthan [chi_square_value $res] 40
 
             # 2) Check that all the elements actually belong to the original hash.
             foreach {key val} $res {
@@ -199,7 +200,8 @@ start_server {tags {"hash"}} {
                     }
                 }
                 assert_equal $all_ele_return true
-                assert_equal [check_histogram_distribution $allkey 0.05 0.15] true
+                # df = 9: chi-square >= 40 occurs by chance with probability ~0.00001
+                assert_lessthan [chi_square_value $allkey] 40
             }
         }
         r config set hash-max-ziplist-value $original_max_value
diff --git a/tests/unit/type/set.tcl b/tests/unit/type/set.tcl
index 4eb93a21..5548ca3a 100644
--- a/tests/unit/type/set.tcl
+++ b/tests/unit/type/set.tcl
@@ -533,8 +533,9 @@ start_server {
             }
 
             # Use negative count (PATH 1).
+            # df = 9: chi-square >= 40 occurs by chance with probability ~0.00001
             set res [r srandmember myset -1000]
-            assert_equal [check_histogram_distribution $res 0.05 0.15] true
+            assert_lessthan [chi_square_value $res] 40
 
             # Use positive count (both PATH 3 and PATH 4).
             foreach size {8 2} {
@@ -547,7 +548,8 @@ start_server {
                         lappend allkey $ele
                     }
                 }
-                assert_equal [check_histogram_distribution $allkey 0.05 0.15] true
+                # df = 9: chi-square >= 40 occurs by chance with probability ~0.00001
+                assert_lessthan [chi_square_value $allkey] 40
             }
         }
     }
diff --git a/tests/unit/type/zset.tcl b/tests/unit/type/zset.tcl
index 2456815f..0170d2bf 100644
--- a/tests/unit/type/zset.tcl
+++ b/tests/unit/type/zset.tcl
@@ -1655,8 +1655,9 @@ start_server {tags {"zset"}} {
             assert_equal [llength $res] 2002
 
             # Test random uniform distribution
+            # df = 9: chi-square >= 40 occurs by chance with probability ~0.00001
             set res [r zrandmember myzset -1000]
-            assert_equal [check_histogram_distribution $res 0.05 0.15] true
+            assert_lessthan [chi_square_value $res] 40
 
             # 2) Check that all the elements actually belong to the original zset.
             foreach {key val} $res {
@@ -1749,7 +1750,8 @@ start_server {tags {"zset"}} {
                     }
                 }
                 assert_equal $all_ele_return true
-                assert_equal [check_histogram_distribution $allkey 0.05 0.15] true
+                # df = 9: chi-square >= 40 occurs by chance with probability ~0.00001
+                assert_lessthan [chi_square_value $allkey] 40
             }
         }
         r config set zset-max-ziplist-value $original_max_value