From 4631efe8650a512598f2a65f673c0eef5a4e49a5 Mon Sep 17 00:00:00 2001 From: Ayman Lafaz Date: Sat, 2 Oct 2021 14:13:02 +0100 Subject: [PATCH 1/4] added approximate counting implementation in python --- .../approximate_counting.md | 12 +++-- .../code/python/approximate_counting.py | 49 +++++++++++++++++++ 2 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 contents/approximate_counting/code/python/approximate_counting.py diff --git a/contents/approximate_counting/approximate_counting.md b/contents/approximate_counting/approximate_counting.md index 917e79922..f63d7db43 100644 --- a/contents/approximate_counting/approximate_counting.md +++ b/contents/approximate_counting/approximate_counting.md @@ -127,7 +127,7 @@ Here is a table for the true count, approximate count, and percent error for 10, | 500,000 | 499,813.2 | 0.037 | | 1,000,000 | 999,466.0 | 0.053 | -Here, it seems that the percent error is 10 times higher when we count 10,000 items; however, +Here, it seems that the percent error is 10 times higher when we count 10,000 items; however, with these numbers, I could imagine some people reading this are thinking that we are splitting hairs. A 0.42% error is still really good, right? Right. @@ -200,7 +200,7 @@ To be clear, here is a table of several values that could be stored in a bitstri | $$00000100 = 4$$ | $$15$$ | | $$00010000 = 16$$ | $$65535$$ | | $$01000000 = 64$$ | $$1.85 \times 10^{19}$$ | -| $$10000000 = 128$$ | $$3.40 \times 10^{38}$$ | +| $$10000000 = 128$$ | $$3.40 \times 10^{38}$$ | | $$11111111 = 255$$ | $$5.79 \times 10^{76}$$ | This means that we can hold from $$0$$ to $$2^{255} - 1 \approx 5.79 \times 10^{76}$$ with 8 bits using this new method. @@ -250,7 +250,7 @@ In the next section, we will consider how to generalize this logarithmic method ## A slightly more general logarithm Let's start by considering the differences between base $$2$$ and base $$e$$. -For base $$e$$, +For base $$e$$, $$ \begin{align} @@ -283,14 +283,14 @@ Going one step further, we need to chose a specific base to a logarithm that wil $$ \begin{align} - v &= \frac{\log(1+n/a)}{\log(1+1/a)}. \\ + v &= \frac{\log(1+n/a)}{\log(1+1/a)}. \\ n_v &= a\left(\left(1+\frac{1}{a}\right)^v-1\right). \end{align} $$ Here, $$a$$ is an effective tuning parameter and sets the maximum count allowed by the bitstring and the expected error. The expression $$1+1/a$$ acts as a base for the logarithm and exponents and ensures that the first count of $$n=1$$ will also set the value $$v=1$$. -As an example, if the bitstring can be a maximum of 255 (for 8 bits) and we arbitrarily set +As an example, if the bitstring can be a maximum of 255 (for 8 bits) and we arbitrarily set $$a=30$$, then the highest possible count with this approach will be $$\approx 130,000$$, which was the number reported in Morris's paper. If we perform a few counting experiments, we find that this formula more closely tracks smaller numbers than before (when we were not using the logarithm): @@ -362,6 +362,8 @@ As we do not have any objects to count, we will instead simulate the counting wi [import, lang:"julia"](code/julia/approximate_counting.jl) {% sample lang="cpp" %} [import, lang:"cpp"](code/c++/approximate_counting.cpp) +{% sample lang="python" %} +[import, lang:"python"](code/python/approximate_counting.py) {% endmethod %} ### Bibliography diff --git a/contents/approximate_counting/code/python/approximate_counting.py b/contents/approximate_counting/code/python/approximate_counting.py new file mode 100644 index 000000000..efbac57aa --- /dev/null +++ b/contents/approximate_counting/code/python/approximate_counting.py @@ -0,0 +1,49 @@ +from random import random + +# This function takes +# - v: value in register +# - a: a scaling value for the logarithm based on Morris's paper +# It returns n(v,a), the approximate_count +def n(v, a): + return a*((1 + 1/a)**v - 1) + +# This function takes +# - v: value in register +# - a: a scaling value for the logarithm based on Morris's paper +# It returns a new value for v +def increment(v, a): + delta = 1/(n(v + 1, a) - n(v, a)) + if random() <= delta: + return v + 1 + else: + return v + +#This simulates counting and takes +# - n_items: number of items to count and loop over +# - a: a scaling value for the logarithm based on Morris's paper +# It returns n(v,a), the approximate count +def approximate_count(n_items, a): + v = 0 + for i in range(1, n_items): + v = increment(v, a) + return n(v, a) + +# This function takes +# - n_trials: the number of counting trials +# - n_items: the number of items to count to +# - a: a scaling value for the logarithm based on Morris's paper +# - threshold: the maximum percent error allowed +# It returns a true / false test value +def test_approximate_count(n_trials, n_items, a, threshold): + samples = [approximate_count(n_items, a) for i in range(1, n_trials)] + avg = sum(samples)/n_trials + + if abs((avg - n_items)/n_items) < threshold: + print("passed") + +print("testing 1,000, a = 30, 1% error") +test_approximate_count(100, 1000, 30, 0.1) +print("testing 12,345, a = 10, 1% error") +test_approximate_count(100, 12345, 10, 0.1) +print("testing 222,222, a = 0.5, 10% error") +test_approximate_count(100, 222222, 0.5, 0.2) From 157fb4a5542095294d975e954f4570fde5f44272 Mon Sep 17 00:00:00 2001 From: Ayman Lafaz Date: Sun, 3 Oct 2021 15:06:23 +0100 Subject: [PATCH 2/4] Update approximate_counting.py --- .../approximate_counting/code/python/approximate_counting.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contents/approximate_counting/code/python/approximate_counting.py b/contents/approximate_counting/code/python/approximate_counting.py index efbac57aa..eb31b2b24 100644 --- a/contents/approximate_counting/code/python/approximate_counting.py +++ b/contents/approximate_counting/code/python/approximate_counting.py @@ -24,7 +24,7 @@ def increment(v, a): # It returns n(v,a), the approximate count def approximate_count(n_items, a): v = 0 - for i in range(1, n_items): + for i in range(1, n_items + 1): v = increment(v, a) return n(v, a) @@ -35,7 +35,7 @@ def approximate_count(n_items, a): # - threshold: the maximum percent error allowed # It returns a true / false test value def test_approximate_count(n_trials, n_items, a, threshold): - samples = [approximate_count(n_items, a) for i in range(1, n_trials)] + samples = [approximate_count(n_items, a) for i in range(1, n_trials + 1)] avg = sum(samples)/n_trials if abs((avg - n_items)/n_items) < threshold: From 29044a6fbae85267577ecbfac7fb365de207eb9c Mon Sep 17 00:00:00 2001 From: Ayman Lafaz Date: Sun, 3 Oct 2021 15:14:14 +0100 Subject: [PATCH 3/4] updating approximate_counting.py --- .../approximate_counting/code/python/approximate_counting.py | 4 ++-- package.json | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/contents/approximate_counting/code/python/approximate_counting.py b/contents/approximate_counting/code/python/approximate_counting.py index efbac57aa..eb31b2b24 100644 --- a/contents/approximate_counting/code/python/approximate_counting.py +++ b/contents/approximate_counting/code/python/approximate_counting.py @@ -24,7 +24,7 @@ def increment(v, a): # It returns n(v,a), the approximate count def approximate_count(n_items, a): v = 0 - for i in range(1, n_items): + for i in range(1, n_items + 1): v = increment(v, a) return n(v, a) @@ -35,7 +35,7 @@ def approximate_count(n_items, a): # - threshold: the maximum percent error allowed # It returns a true / false test value def test_approximate_count(n_trials, n_items, a, threshold): - samples = [approximate_count(n_items, a) for i in range(1, n_trials)] + samples = [approximate_count(n_items, a) for i in range(1, n_trials + 1)] avg = sum(samples)/n_trials if abs((avg - n_items)/n_items) < threshold: diff --git a/package.json b/package.json index 2d2189e13..3a346a16f 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,8 @@ "gitbook-plugin-mathjax": "git+https://github.com/algorithm-archivists/plugin-mathjax.git", "gitbook-plugin-prism": "^2.4.0", "gitbook-plugin-wordcount": "^0.0.1", - "honkit": "^3.6.16" + "honkit": "^3.6.16", + "package.json": "^2.0.1" }, "repository": { "type": "git", From 773873168ac5cc8527affd65c3b29dd450d3b511 Mon Sep 17 00:00:00 2001 From: Ayman Lafaz Date: Wed, 6 Oct 2021 03:36:21 +0100 Subject: [PATCH 4/4] removed redundancies --- package.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/package.json b/package.json index 3a346a16f..2d2189e13 100644 --- a/package.json +++ b/package.json @@ -16,8 +16,7 @@ "gitbook-plugin-mathjax": "git+https://github.com/algorithm-archivists/plugin-mathjax.git", "gitbook-plugin-prism": "^2.4.0", "gitbook-plugin-wordcount": "^0.0.1", - "honkit": "^3.6.16", - "package.json": "^2.0.1" + "honkit": "^3.6.16" }, "repository": { "type": "git",