From 8ac484c854c1b718fbbe97c9ba6a98fe5980aca3 Mon Sep 17 00:00:00 2001 From: FamALouiz Date: Sun, 9 Mar 2025 13:15:46 +0100 Subject: [PATCH 1/4] Added urban dictionary example using pydatastruct search API --- ...ydatastructs_urban_dictionary_exampl.ipynb | 697 ++++++++++++++++++ 1 file changed, 697 insertions(+) create mode 100644 docs/source/pydatastructs_urban_dictionary_exampl.ipynb diff --git a/docs/source/pydatastructs_urban_dictionary_exampl.ipynb b/docs/source/pydatastructs_urban_dictionary_exampl.ipynb new file mode 100644 index 00000000..68ec01f3 --- /dev/null +++ b/docs/source/pydatastructs_urban_dictionary_exampl.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Analysis of the PyDataStructs string matching API\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset\n", + "\n", + "We have used the [Urban Dictionary Words and Definitions](https://www.kaggle.com/datasets/therohk/urban-dictionary-words-dataset) dataset from [Urban Dictionary](https://www.urbandictionary.com/). The intent of this demo is to show how the different string matching algorithms exposed by the **pydatastructs** API can be used and how their performance compares.\n", + "\n", + "The Urban Dictionary Words and Definitions dataset contains a collection of crowdsourced slang terms and their meanings from Urban Dictionary, a popular user-driven dictionary for contemporary language and internet jargon. This dataset includes word entries, corresponding definitions, upvote/downvote counts, and other metadata, making it valuable for natural language processing (NLP) tasks, sentiment analysis, and linguistic research. 
Due to its informal nature, the dataset captures evolving slang, cultural references, and colloquial expressions, providing insights into modern language trends.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import API for string searching\n", + "from pydatastructs import find\n", + "\n", + "# Import util modules\n", + "import time\n", + "import pandas as pd\n", + "from pandas.core.frame import DataFrame\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\fam\\AppData\\Local\\Temp\\ipykernel_23428\\982205164.py:2: DtypeWarning: Columns (3,7,8,9,10,11) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df: DataFrame = pd.read_csv(\"urbandict-word-defs.csv\")\n" + ] + } + ], + "source": [ + "# Load dataset\n", + "df: DataFrame = pd.read_csv(\"urbandict-word-defs.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset Shape:\n", + "(1048525, 12)\n", + "Dataset Columns:\n", + "Index(['word_id', 'word', 'up_votes', 'down_votes', 'author', 'definition',\n", + " 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10',\n", + " 'Unnamed: 11'],\n", + " dtype='object')\n", + "Dataset Info:\n", + "\n", + "RangeIndex: 1048525 entries, 0 to 1048524\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 word_id 1048525 non-null int64 \n", + " 1 word 1048500 non-null object\n", + " 2 up_votes 1048497 non-null object\n", + " 3 down_votes 1048521 non-null object\n", + " 4 author 1048524 non-null object\n", + " 5 definition 1048362 non-null object\n", + " 6 Unnamed: 6 153 non-null object\n", + 
" 7 Unnamed: 7 34 non-null object\n", + " 8 Unnamed: 8 6 non-null object\n", + " 9 Unnamed: 9 2 non-null object\n", + " 10 Unnamed: 10 1 non-null object\n", + " 11 Unnamed: 11 1 non-null object\n", + "dtypes: int64(1), object(11)\n", + "memory usage: 96.0+ MB\n", + "None\n", + "Dataset Description:\n", + " word_id\n", + "count 1.048525e+06\n", + "mean 1.411658e+06\n", + "std 8.498497e+05\n", + "min 7.000000e+00\n", + "25% 6.826870e+05\n", + "50% 1.398224e+06\n", + "75% 2.115287e+06\n", + "max 2.856896e+06\n", + "Dataset Sample:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
word_idwordup_votesdown_votesauthordefinitionUnnamed: 6Unnamed: 7Unnamed: 8Unnamed: 9Unnamed: 10Unnamed: 11
07Janky296255dc397b2fUndesirable; less-than optimum.NaNNaNNaNNaNNaNNaN
18slumpin'1637dc397b2flow down and funky, but [knee deep] enough to ...NaNNaNNaNNaNNaNNaN
29yayeeyay1927dc397b2faffirmation; suggestion of encouragement, appr...NaNNaNNaNNaNNaNNaN
312hard-core16296d1610749anything out of our league that can be good or...NaNNaNNaNNaNNaNNaN
413brutal124540ece1efanything that makes you sweatNaNNaNNaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " word_id word up_votes down_votes author \\\n", + "0 7 Janky 296 255 dc397b2f \n", + "1 8 slumpin' 16 37 dc397b2f \n", + "2 9 yayeeyay 19 27 dc397b2f \n", + "3 12 hard-core 162 96 d1610749 \n", + "4 13 brutal 12 45 40ece1ef \n", + "\n", + " definition Unnamed: 6 Unnamed: 7 \\\n", + "0 Undesirable; less-than optimum. NaN NaN \n", + "1 low down and funky, but [knee deep] enough to ... NaN NaN \n", + "2 affirmation; suggestion of encouragement, appr... NaN NaN \n", + "3 anything out of our league that can be good or... NaN NaN \n", + "4 anything that makes you sweat NaN NaN \n", + "\n", + " Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 \n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Display some info about dataset\n", + "print(\"Dataset Shape:\")\n", + "print(df.shape)\n", + "print(\"Dataset Columns:\")\n", + "print(df.columns)\n", + "print(\"Dataset Info:\")\n", + "print(df.info())\n", + "print(\"Dataset Description:\")\n", + "print(df.describe())\n", + "print(\"Dataset Sample:\")\n", + "display(df.head())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Search Methodologies\n", + "\n", + "We employ four different search techniques to analyze word queries within the dataset. They all use the pydatastructs backend and API.\n", + "The four algorithms are as follows:\n", + "\n", + "1. Knuth-Morris-Pratt\n", + "2. Rabin–Karp\n", + "3. Boyer-Moore\n", + "4. Z-Function\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class Searcher:\n", + " \"\"\"\n", + " Provides static methods for performing string search using various algorithms\n", + " from the pydatastructs library. 
It also includes a method to evaluate the performance of these\n", + " search algorithms.\n", + "\n", + " Methods\n", + " -------\n", + " pydatastrcuts_rabin_karp_serach(text: str, pattern: str) -> int\n", + " Uses the Rabin-Karp algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " pydatastructs_kmp_search(text: str, pattern: str) -> int\n", + " Uses the Knuth-Morris-Pratt algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " pydatastructs_boyer_moore_search(text: str, pattern: str) -> int\n", + " Uses the Boyer-Moore algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " pydatastructs_z_function_search(text: str, pattern: str) -> int\n", + " Uses the Z-Function algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " evaluate_search_performance(search_func, text: str, query: str) -> Tuple[int, float]\n", + " Evaluates the performance of a given search function by measuring the execution time\n", + " and the number of results found for a specific query in the text.\n", + " \"\"\"\n", + "\n", + " @staticmethod\n", + " def pydatastrcuts_rabin_karp_serach(text: str, pattern: str) -> int:\n", + " \"\"\"\n", + " Uses the Rabin-Karp algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " Parameters\n", + " ----------\n", + " text : str\n", + " The text in which to search for the pattern.\n", + " pattern : str\n", + " The pattern to search for in the text.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of occurrences of the pattern found in the text.\n", + " \"\"\"\n", + " positions = find(text, pattern, algorithm='rabin_karp')\n", + " return len(positions)\n", + "\n", + " @staticmethod\n", + " def pydatastructs_kmp_search(text: str, pattern: str) -> int:\n", + " \"\"\"\n", + " Uses the Knuth-Morris-Pratt algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " Parameters\n", + " ----------\n", + " text : str\n", + " The text 
in which to search for the pattern.\n", + " pattern : str\n", + " The pattern to search for in the text.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of occurrences of the pattern found in the text.\n", + " \"\"\"\n", + " positions = find(text, pattern, algorithm='kmp')\n", + " return len(positions)\n", + "\n", + " @staticmethod\n", + " def pydatastructs_boyer_moore_search(text: str, pattern: str) -> int:\n", + " \"\"\"\n", + " Uses the Boyer-Moore algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " Parameters\n", + " ----------\n", + " text : str\n", + " The text in which to search for the pattern.\n", + " pattern : str\n", + " The pattern to search for in the text.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of occurrences of the pattern found in the text.\n", + " \"\"\"\n", + " positions = find(text, pattern, algorithm='boyer_moore')\n", + " return len(positions)\n", + "\n", + " @staticmethod\n", + " def pydatastructs_z_function_search(text: str, pattern: str) -> int:\n", + " \"\"\"\n", + " Uses the Z-Function algorithm to find occurrences of a pattern in a given text.\n", + "\n", + " Parameters\n", + " ----------\n", + " text : str\n", + " The text in which to search for the pattern.\n", + " pattern : str\n", + " The pattern to search for in the text.\n", + "\n", + " Returns\n", + " -------\n", + " int\n", + " The number of occurrences of the pattern found in the text.\n", + " \"\"\"\n", + " positions = find(text, pattern, algorithm='z_function')\n", + " return len(positions)\n", + "\n", + " # Evaluate performance of search functions\n", + " @staticmethod\n", + " def evaluate_search_performance(search_func, text: str, query: str) -> tuple[int, float]:\n", + " \"\"\"\n", + " Evaluates the performance of a given search function by measuring the execution time\n", + " and the number of results found for a specific query in the text.\n", + "\n", + " Parameters\n", + " ----------\n", + " 
search_func : function\n", + " The search function to evaluate. It should take two arguments: text and query.\n", + " text : str\n", + " The text in which to search for the query.\n", + " query : str\n", + " The query to search for in the text.\n", + "\n", + " Returns\n", + " -------\n", + " tuple[int, float]\n", + " A tuple containing the number of occurrences of the query found in the text\n", + " and the execution time of the search function.\n", + " \"\"\"\n", + " start_time = time.time()\n", + " result = search_func(text, query)\n", + " end_time = time.time()\n", + " execution_time = end_time - start_time\n", + " return result, execution_time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Best and worst case words for each algorithm\n", + "\n", + "boyer_moore_best: list[str] = [\"xylophone\"] # Unique letters allow large skips\n", + "boyer_moore_worst: list[str] = [\"aaaaa\"] # Repetitive patterns force O(mn)\n", + "\n", + "kmp_best: list[str] = [\"banana\"] # Repeating prefixes make KMP efficient\n", + "# No repeating substrings, so KMP has no major optimizations\n", + "kmp_worst: list[str] = [\"xyzabc\"]\n", + "\n", + "# Quickly identifies repeated prefix-suffix patterns\n", + "z_function_best: list[str] = [\"abracadabra\"]\n", + "# No repeating substrings, making Z-function similar to KMP\n", + "z_function_worst: list[str] = [\"qwerty\"]\n", + "\n", + "# Great for multi-pattern searches\n", + "rabin_karp_best: list[str] = [\"hello\", \"world\", \"search\"]\n", + "# Hash collisions can degrade performance to O(mn)\n", + "rabin_karp_worst: list[str] = [\"abcdefghij\"]\n", + "\n", + "# Test queries\n", + "test_queries = boyer_moore_best + boyer_moore_worst + kmp_best + kmp_worst + \\\n", + " z_function_best + z_function_worst + rabin_karp_best + rabin_karp_worst\n", + "\n", + "\n", + "\n", + "methods = {\n", + " \"Rabin-Karp\": Searcher.pydatastrcuts_rabin_karp_serach,\n", + " 
\"Knuth-Morris-Pratt\": Searcher.pydatastructs_kmp_search,\n", + " \"Boyer-Moore\": Searcher.pydatastructs_boyer_moore_search,\n", + " \"Z-Function\": Searcher.pydatastructs_z_function_search\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run evaluation\n", + "performance_results = []\n", + "text = df['word'].str.cat(sep=' ') # Concatenate all words in the dataset\n", + "for method_name, method_func in methods.items():\n", + " for query in test_queries:\n", + " num_results, exec_time = Searcher.evaluate_search_performance(\n", + " method_func, text, query)\n", + " performance_results.append(\n", + " [method_name, query, num_results, exec_time])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results\n", + "\n", + "The comparison of the **Boyer-Moore, Knuth-Morris-Pratt (KMP), Z-function, and Rabin-Karp** string search algorithms highlights their strengths and weaknesses across different scenarios.\n", + "\n", + "- **Boyer-Moore** excels in average-case performance, often achieving **sublinear time**, making it ideal for long patterns in large alphabets. However, it suffers from **O(mn) worst-case performance** on repetitive text.\n", + "- **KMP** and **Z-function** guarantee **O(n + m) worst-case complexity**, making them more reliable for structured pattern matching, though they lack the efficiency of Boyer-Moore in general cases.\n", + "- **Rabin-Karp**, leveraging hashing, runs in **O(n + m) average-case** time but degrades to **O(mn) worst-case** when many hash collisions occur; its main practical advantage is that the hashing approach extends naturally to multiple-pattern searches.\n", + "\n", + "Ultimately, the choice depends on the text structure, pattern length, and performance guarantees required for the application.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
MethodQueryResults FoundExecution Time (s)
0Rabin-Karpcool102316.389146
1Rabin-Karplit819115.893443
2Rabin-Karpsavage12716.709997
3Knuth-Morris-Prattcool10232.048637
4Knuth-Morris-Prattlit81912.275958
5Knuth-Morris-Prattsavage1272.154522
6Boyer-Moorecool10231.486486
7Boyer-Moorelit81911.401592
8Boyer-Mooresavage1270.563430
9Z-Functioncool102324.647368
10Z-Functionlit819123.557560
11Z-Functionsavage12752.787895
\n", + "
" + ], + "text/plain": [ + " Method Query Results Found Execution Time (s)\n", + "0 Rabin-Karp cool 1023 16.389146\n", + "1 Rabin-Karp lit 8191 15.893443\n", + "2 Rabin-Karp savage 127 16.709997\n", + "3 Knuth-Morris-Pratt cool 1023 2.048637\n", + "4 Knuth-Morris-Pratt lit 8191 2.275958\n", + "5 Knuth-Morris-Pratt savage 127 2.154522\n", + "6 Boyer-Moore cool 1023 1.486486\n", + "7 Boyer-Moore lit 8191 1.401592\n", + "8 Boyer-Moore savage 127 0.563430\n", + "9 Z-Function cool 1023 24.647368\n", + "10 Z-Function lit 8191 23.557560\n", + "11 Z-Function savage 127 52.787895" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Display performance results\n", + "performance_df = pd.DataFrame(performance_results, columns=[\n", + " \"Method\", \"Query\", \"Results Found\", \"Execution Time (s)\"])\n", + "display(performance_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "r" + } + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Set the aesthetic style of the plots\n", + "sns.set_theme(context=\"notebook\", style=\"whitegrid\")\n", + "\n", + "# Create a bar plot\n", + "plt.figure(figsize=(12, 6))\n", + "sns.barplot(x=\"Query\", y=\"Execution Time (s)\",\n", + " hue=\"Method\", data=performance_df)\n", + "\n", + "# Add titles and labels\n", + "plt.title(\"Comparison of Execution Times for Different Search Algorithms\")\n", + "plt.xlabel(\"Query\")\n", + "plt.ylabel(\"Execution Time (s)\")\n", + "\n", + "# Display the plot\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 1c8d52b4187d086576b46ed000b4c09d1a7d7358 Mon Sep 17 00:00:00 2001 From: FamALouiz Date: Sun, 9 Mar 2025 13:20:34 +0100 Subject: [PATCH 2/4] Fixed notebook naming --- ..._exampl.ipynb => pydatastructs_urban_dictionary_example.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/source/{pydatastructs_urban_dictionary_exampl.ipynb => pydatastructs_urban_dictionary_example.ipynb} (100%) diff --git a/docs/source/pydatastructs_urban_dictionary_exampl.ipynb b/docs/source/pydatastructs_urban_dictionary_example.ipynb similarity index 100% rename from docs/source/pydatastructs_urban_dictionary_exampl.ipynb rename to docs/source/pydatastructs_urban_dictionary_example.ipynb From ab77142f4b790ad8766135f53efa158a0a092e03 Mon Sep 17 00:00:00 2001 From: FamALouiz Date: Sun, 9 Mar 2025 13:49:51 +0100 Subject: [PATCH 3/4] Updated notebook cache with extended run --- ...datastructs_urban_dictionary_example.ipynb | 359 ++++++++++++++---- 1 file 
changed, 289 insertions(+), 70 deletions(-) diff --git a/docs/source/pydatastructs_urban_dictionary_example.ipynb b/docs/source/pydatastructs_urban_dictionary_example.ipynb index 68ec01f3..017f3c1a 100644 --- a/docs/source/pydatastructs_urban_dictionary_example.ipynb +++ b/docs/source/pydatastructs_urban_dictionary_example.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -37,14 +37,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\fam\\AppData\\Local\\Temp\\ipykernel_23428\\982205164.py:2: DtypeWarning: Columns (3,7,8,9,10,11) have mixed types. Specify dtype option on import or set low_memory=False.\n", + "C:\\Users\\fam\\AppData\\Local\\Temp\\ipykernel_7024\\982205164.py:2: DtypeWarning: Columns (3,7,8,9,10,11) have mixed types. Specify dtype option on import or set low_memory=False.\n", " df: DataFrame = pd.read_csv(\"urbandict-word-defs.csv\")\n" ] } @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -277,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -417,7 +417,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -445,7 +445,6 @@ " z_function_best + z_function_worst + rabin_karp_best + rabin_karp_worst\n", "\n", "\n", - "\n", "methods = {\n", " \"Rabin-Karp\": Searcher.pydatastrcuts_rabin_karp_serach,\n", " \"Knuth-Morris-Pratt\": Searcher.pydatastructs_kmp_search,\n", @@ -456,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -488,7 +487,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, 
"metadata": {}, "outputs": [ { @@ -522,105 +521,329 @@ " \n", " 0\n", " Rabin-Karp\n", - " cool\n", - " 1023\n", - " 16.389146\n", + " xylophone\n", + " 7\n", + " 16.368275\n", " \n", " \n", " 1\n", " Rabin-Karp\n", - " lit\n", - " 8191\n", - " 15.893443\n", + " aaaaa\n", + " 255\n", + " 30.503066\n", " \n", " \n", " 2\n", " Rabin-Karp\n", - " savage\n", - " 127\n", - " 16.709997\n", + " banana\n", + " 511\n", + " 63.199950\n", " \n", " \n", " 3\n", - " Knuth-Morris-Pratt\n", - " cool\n", - " 1023\n", - " 2.048637\n", + " Rabin-Karp\n", + " xyzabc\n", + " 0\n", + " 62.661349\n", " \n", " \n", " 4\n", - " Knuth-Morris-Pratt\n", - " lit\n", - " 8191\n", - " 2.275958\n", + " Rabin-Karp\n", + " abracadabra\n", + " 1\n", + " 58.821800\n", " \n", " \n", " 5\n", + " Rabin-Karp\n", + " qwerty\n", + " 63\n", + " 14.416847\n", + " \n", + " \n", + " 6\n", + " Rabin-Karp\n", + " hello\n", + " 255\n", + " 16.551707\n", + " \n", + " \n", + " 7\n", + " Rabin-Karp\n", + " world\n", + " 1023\n", + " 15.397617\n", + " \n", + " \n", + " 8\n", + " Rabin-Karp\n", + " search\n", + " 127\n", + " 56.959028\n", + " \n", + " \n", + " 9\n", + " Rabin-Karp\n", + " abcdefghij\n", + " 15\n", + " 22.469995\n", + " \n", + " \n", + " 10\n", + " Knuth-Morris-Pratt\n", + " xylophone\n", + " 7\n", + " 3.466889\n", + " \n", + " \n", + " 11\n", + " Knuth-Morris-Pratt\n", + " aaaaa\n", + " 255\n", + " 3.379942\n", + " \n", + " \n", + " 12\n", + " Knuth-Morris-Pratt\n", + " banana\n", + " 511\n", + " 3.340413\n", + " \n", + " \n", + " 13\n", + " Knuth-Morris-Pratt\n", + " xyzabc\n", + " 0\n", + " 3.506472\n", + " \n", + " \n", + " 14\n", + " Knuth-Morris-Pratt\n", + " abracadabra\n", + " 1\n", + " 3.423525\n", + " \n", + " \n", + " 15\n", + " Knuth-Morris-Pratt\n", + " qwerty\n", + " 63\n", + " 3.244726\n", + " \n", + " \n", + " 16\n", + " Knuth-Morris-Pratt\n", + " hello\n", + " 255\n", + " 3.567353\n", + " \n", + " \n", + " 17\n", + " Knuth-Morris-Pratt\n", + " world\n", + " 1023\n", + " 2.514962\n", + 
" \n", + " \n", + " 18\n", " Knuth-Morris-Pratt\n", - " savage\n", + " search\n", " 127\n", - " 2.154522\n", + " 2.727760\n", " \n", " \n", - " 6\n", + " 19\n", + " Knuth-Morris-Pratt\n", + " abcdefghij\n", + " 15\n", + " 2.416548\n", + " \n", + " \n", + " 20\n", " Boyer-Moore\n", - " cool\n", - " 1023\n", - " 1.486486\n", + " xylophone\n", + " 7\n", + " 0.409917\n", " \n", " \n", - " 7\n", + " 21\n", " Boyer-Moore\n", - " lit\n", - " 8191\n", - " 1.401592\n", + " aaaaa\n", + " 255\n", + " 0.582294\n", " \n", " \n", - " 8\n", + " 22\n", + " Boyer-Moore\n", + " banana\n", + " 511\n", + " 0.765548\n", + " \n", + " \n", + " 23\n", " Boyer-Moore\n", - " savage\n", + " xyzabc\n", + " 0\n", + " 0.721526\n", + " \n", + " \n", + " 24\n", + " Boyer-Moore\n", + " abracadabra\n", + " 1\n", + " 0.285021\n", + " \n", + " \n", + " 25\n", + " Boyer-Moore\n", + " qwerty\n", + " 63\n", + " 1.034254\n", + " \n", + " \n", + " 26\n", + " Boyer-Moore\n", + " hello\n", + " 255\n", + " 0.670151\n", + " \n", + " \n", + " 27\n", + " Boyer-Moore\n", + " world\n", + " 1023\n", + " 0.705552\n", + " \n", + " \n", + " 28\n", + " Boyer-Moore\n", + " search\n", " 127\n", - " 0.563430\n", + " 0.847801\n", " \n", " \n", - " 9\n", + " 29\n", + " Boyer-Moore\n", + " abcdefghij\n", + " 15\n", + " 0.395121\n", + " \n", + " \n", + " 30\n", " Z-Function\n", - " cool\n", - " 1023\n", - " 24.647368\n", + " xylophone\n", + " 7\n", + " 26.620959\n", " \n", " \n", - " 10\n", + " 31\n", " Z-Function\n", - " lit\n", - " 8191\n", - " 23.557560\n", + " aaaaa\n", + " 255\n", + " 22.801260\n", " \n", " \n", - " 11\n", + " 32\n", + " Z-Function\n", + " banana\n", + " 511\n", + " 22.610493\n", + " \n", + " \n", + " 33\n", + " Z-Function\n", + " xyzabc\n", + " 0\n", + " 21.820343\n", + " \n", + " \n", + " 34\n", + " Z-Function\n", + " abracadabra\n", + " 1\n", + " 20.470267\n", + " \n", + " \n", + " 35\n", + " Z-Function\n", + " qwerty\n", + " 63\n", + " 24.497443\n", + " \n", + " \n", + " 36\n", + " Z-Function\n", + 
" hello\n", + " 255\n", + " 25.262055\n", + " \n", + " \n", + " 37\n", + " Z-Function\n", + " world\n", + " 1023\n", + " 24.878788\n", + " \n", + " \n", + " 38\n", " Z-Function\n", - " savage\n", + " search\n", " 127\n", - " 52.787895\n", + " 24.227696\n", + " \n", + " \n", + " 39\n", + " Z-Function\n", + " abcdefghij\n", + " 15\n", + " 24.152845\n", " \n", " \n", "\n", "" ], "text/plain": [ - " Method Query Results Found Execution Time (s)\n", - "0 Rabin-Karp cool 1023 16.389146\n", - "1 Rabin-Karp lit 8191 15.893443\n", - "2 Rabin-Karp savage 127 16.709997\n", - "3 Knuth-Morris-Pratt cool 1023 2.048637\n", - "4 Knuth-Morris-Pratt lit 8191 2.275958\n", - "5 Knuth-Morris-Pratt savage 127 2.154522\n", - "6 Boyer-Moore cool 1023 1.486486\n", - "7 Boyer-Moore lit 8191 1.401592\n", - "8 Boyer-Moore savage 127 0.563430\n", - "9 Z-Function cool 1023 24.647368\n", - "10 Z-Function lit 8191 23.557560\n", - "11 Z-Function savage 127 52.787895" + " Method Query Results Found Execution Time (s)\n", + "0 Rabin-Karp xylophone 7 16.368275\n", + "1 Rabin-Karp aaaaa 255 30.503066\n", + "2 Rabin-Karp banana 511 63.199950\n", + "3 Rabin-Karp xyzabc 0 62.661349\n", + "4 Rabin-Karp abracadabra 1 58.821800\n", + "5 Rabin-Karp qwerty 63 14.416847\n", + "6 Rabin-Karp hello 255 16.551707\n", + "7 Rabin-Karp world 1023 15.397617\n", + "8 Rabin-Karp search 127 56.959028\n", + "9 Rabin-Karp abcdefghij 15 22.469995\n", + "10 Knuth-Morris-Pratt xylophone 7 3.466889\n", + "11 Knuth-Morris-Pratt aaaaa 255 3.379942\n", + "12 Knuth-Morris-Pratt banana 511 3.340413\n", + "13 Knuth-Morris-Pratt xyzabc 0 3.506472\n", + "14 Knuth-Morris-Pratt abracadabra 1 3.423525\n", + "15 Knuth-Morris-Pratt qwerty 63 3.244726\n", + "16 Knuth-Morris-Pratt hello 255 3.567353\n", + "17 Knuth-Morris-Pratt world 1023 2.514962\n", + "18 Knuth-Morris-Pratt search 127 2.727760\n", + "19 Knuth-Morris-Pratt abcdefghij 15 2.416548\n", + "20 Boyer-Moore xylophone 7 0.409917\n", + "21 Boyer-Moore aaaaa 255 0.582294\n", + "22 
Boyer-Moore banana 511 0.765548\n", + "23 Boyer-Moore xyzabc 0 0.721526\n", + "24 Boyer-Moore abracadabra 1 0.285021\n", + "25 Boyer-Moore qwerty 63 1.034254\n", + "26 Boyer-Moore hello 255 0.670151\n", + "27 Boyer-Moore world 1023 0.705552\n", + "28 Boyer-Moore search 127 0.847801\n", + "29 Boyer-Moore abcdefghij 15 0.395121\n", + "30 Z-Function xylophone 7 26.620959\n", + "31 Z-Function aaaaa 255 22.801260\n", + "32 Z-Function banana 511 22.610493\n", + "33 Z-Function xyzabc 0 21.820343\n", + "34 Z-Function abracadabra 1 20.470267\n", + "35 Z-Function qwerty 63 24.497443\n", + "36 Z-Function hello 255 25.262055\n", + "37 Z-Function world 1023 24.878788\n", + "38 Z-Function search 127 24.227696\n", + "39 Z-Function abcdefghij 15 24.152845" ] }, "metadata": {}, @@ -636,16 +859,12 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "r" - } - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ "
" ] From 4b1822b47f4e1a7633d34a7407154f1969be1081 Mon Sep 17 00:00:00 2001 From: FamALouiz Date: Sun, 9 Mar 2025 14:00:18 +0100 Subject: [PATCH 4/4] Fixed issue with pandas reader --- .../pydatastructs_urban_dictionary_example.ipynb | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/docs/source/pydatastructs_urban_dictionary_example.ipynb b/docs/source/pydatastructs_urban_dictionary_example.ipynb index 017f3c1a..3212a2fb 100644 --- a/docs/source/pydatastructs_urban_dictionary_example.ipynb +++ b/docs/source/pydatastructs_urban_dictionary_example.ipynb @@ -37,21 +37,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 9, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "C:\\Users\\fam\\AppData\\Local\\Temp\\ipykernel_7024\\982205164.py:2: DtypeWarning: Columns (3,7,8,9,10,11) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " df: DataFrame = pd.read_csv(\"urbandict-word-defs.csv\")\n" - ] - } - ], + "outputs": [], "source": [ "# Load dataset\n", - "df: DataFrame = pd.read_csv(\"urbandict-word-defs.csv\")" + "df: DataFrame = pd.read_csv(\"urbandict-word-defs.csv\", low_memory=False)" ] }, {