From b4bfcae6159e1bbfe313ebbc6aff579c659f12ec Mon Sep 17 00:00:00 2001 From: Liam Thompson Date: 2023年7月24日 17:00:29 +0200 Subject: [PATCH 1/2] Cleanup copy, TODOs, URLs --- .../search/00-quick-start.ipynb | 252 +++++++++--------- 1 file changed, 126 insertions(+), 126 deletions(-) diff --git a/colab-notebooks-examples/search/00-quick-start.ipynb b/colab-notebooks-examples/search/00-quick-start.ipynb index daccb2fe..38028098 100644 --- a/colab-notebooks-examples/search/00-quick-start.ipynb +++ b/colab-notebooks-examples/search/00-quick-start.ipynb @@ -7,12 +7,12 @@ "id": "87773ce7" }, "source": [ - "# Elasticsearch Quick Start\n", + "# Elasticsearch quick start: embeddings, semantic search, and hybrid search\n", "\n", - "\\n", + "\\n", "\n", - "This interactive notebook will introduce you to the very basics of getting started with simple Elasticsearch queries, using the official [Elasticsearch Python client](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html).\n", - "We'll run through getting the client up and running, indexing a small data set into Elasticsearch, and performing basic searches against your data." + "This interactive notebook will introduce you to some basic operations with Elasticsearch, using the official [Elasticsearch Python client](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html).\n", + "You'll perform semantic search using [Sentence Transformers](https://www.sbert.net) for text embedding. Learn how to integrate traditional text-based search with semantic search, for a hybrid search system." ] }, { @@ -66,7 +66,8 @@ }, "source": [ "# Setup the Embedding Model\n", - "For this example, we're using the all-MiniLM-L6-v2, part of the sentence_transformers library. You can read more about this model in [hugging face](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)." + "\n", + "For this example, we're using `all-MiniLM-L6-v2`, part of the `sentence_transformers` library. You can read more about this model on [Huggingface](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)." ] }, { @@ -104,10 +105,10 @@ "execution_count": 30, "id": "f38e0397", "metadata": { - "id": "f38e0397", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "f38e0397", "outputId": "ad6df489-d242-4229-a42a-39c5ca19d124" }, "outputs": [ @@ -141,7 +142,7 @@ "id": "fcd165fa" }, "source": [ - "If you're running Elasticsearch locally or self-managed, you can pass in the Elasticsearch host instead. [Read more](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#_verifying_https_with_certificate_fingerprints_python_3_10_or_later) on how to connect to Elasticsearch locally" + "If you're running Elasticsearch locally or self-managed, you can pass in the Elasticsearch host instead. [Read more](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#_verifying_https_with_certificate_fingerprints_python_3_10_or_later) on how to connect to Elasticsearch locally." ] }, { @@ -159,16 +160,16 @@ "execution_count": 31, "id": "25c618eb", "metadata": { - "id": "25c618eb", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "25c618eb", "outputId": "30a6ba5b-5109-4457-ddfe-5633a077ca9b" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "{'name': 'instance-0000000000', 'cluster_name': '1a56ad21587c44d3930932eb9fa1d8e8', 'cluster_uuid': 'gX4zlwtlR4qhZpp1SPm4Yg', 'version': {'number': '8.8.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98e1271edf932a480e4262a471281f1ee295ce6b', 'build_date': '2023-06-26T05:16:16.196344851Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n" ] @@ -215,12 +216,6 @@ "mapping = {\n", " \"mappings\": {\n", " \"properties\": {\n", - " \"title\": {\"type\": \"text\"},\n", - " \"authors\": {\"type\": \"keyword\"},\n", - " \"summary\": {\"type\": \"text\"},\n", - " \"publish_date\": {\"type\": \"date\"},\n", - " \"num_reviews\": {\"type\": \"integer\"},\n", - " \"publisher\": {\"type\": \"keyword\"},\n", " \"title_vector\": {\n", " \"type\": \"dense_vector\",\n", " \"dims\": 384,\n", @@ -244,7 +239,8 @@ "source": [ "### Index test data\n", "\n", - "Run the following command to upload some test data, containing information about 10 popular programming books from this [dataset](https://raw.githubusercontent.com/joemcelroy/elasticsearch-labs/notebooks-guides/colab-notebooks-examples/search/data.json)." + "Run the following command to upload some test data, containing information about 10 popular programming books from this [dataset](https://raw.githubusercontent.com/elastic/elasticsearch-labs/notebooks-guides/colab-notebooks-examples/search/data.json).\n", + "`model.encode` will encode the text into a vector on the fly, using the model we initialized earlier." ] }, { @@ -315,9 +311,10 @@ "id": "39bdefe0" }, "source": [ - "##Making Queries\n", + "## Making wueries\n", "\n", - "Now that we have indexed the books, we want to perform a semantic search for books that similarly match the query. We embed the query and perform a search." + "Now that we have indexed the books, we want to perform a semantic search for books that are similar to a given query.\n", + "We embed the query and perform a search." ] }, { @@ -333,8 +330,8 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n", "ID: OOlWP4kB-GB5Evg6zHVx\n", @@ -429,8 +426,8 @@ ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ ":1: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. Instead use individual parameters.\n", " response = client.search(index=\"book_index\", body={\n" @@ -452,24 +449,29 @@ }, { "cell_type": "markdown", + "id": "LdJCpbQMeml5", + "metadata": { + "id": "LdJCpbQMeml5" + }, "source": [ "## Filtering\n", "\n", - "Filter context is mostly used for filtering structured data, for example:\n", + "Filter context is mostly used for filtering structured data. For example, use filter context to answer questions like:\n", "\n", - "Does this timestamp fall into the range 2015 to 2016?\n", - "Is the status field set to \"published\"?\n", - "Filter context is in effect whenever a query clause is passed to a filter parameter, such as the filter or must_not parameters in the bool query.\n", + "- _Does this timestamp fall into the range 2015 to 2016?_\n", + "- _Is the status field set to \"published\"?_\n", "\n", - "[Read more](https://)" - ], - "metadata": { - "id": "LdJCpbQMeml5" - }, - "id": "LdJCpbQMeml5" + "Filter context is in effect whenever a query clause is passed to a filter parameter, such as the `filter` or `must_not` parameters in a `bool` query.\n", + "\n", + "[Learn more](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-filter-context.html#filter-context) about filter context in the Elasticsearch docs." + ] }, { "cell_type": "markdown", + "id": "dRSrPMyFf7w7", + "metadata": { + "id": "dRSrPMyFf7w7" + }, "source": [ "### Example: Keyword Filtering\n", "\n", @@ -477,15 +479,17 @@ "\n", "It narrows down the results by including only documents where the \"publisher\" field is equal to \"addison-wesley\".\n", "\n", - "Overall, the code retrieves the top books that are similar to \"Best javascript books?\" based on their title vectors and have \"addison-wesley\" as the publisher." - ], - "metadata": { - "id": "dRSrPMyFf7w7" - }, - "id": "dRSrPMyFf7w7" + "The code retrieves the top books that are similar to \"Best javascript books?\" based on their title vectors and have \"addison-wesley\" as the publisher." + ] }, { "cell_type": "code", + "execution_count": null, + "id": "WoE0yTchfj3A", + "metadata": { + "id": "WoE0yTchfj3A" + }, + "outputs": [], "source": [ "response = client.search(index=\"book_index\", body={\n", " \"knn\": {\n", @@ -502,59 +506,26 @@ "})\n", "\n", "pretty_response(response)" - ], - "metadata": { - "id": "WoE0yTchfj3A" - }, - "id": "WoE0yTchfj3A", - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "### Example: Advanced Filtering\n", - "\n", - "TODO: Help the developer understand more about the different types of filtering they can do.\n", - "\n", - "Provide a link to show more advanced use cases of filtering on date-range, geo-location etc." - ], + "id": "YY2SrWDtgnF3", "metadata": { "id": "YY2SrWDtgnF3" }, - "id": "YY2SrWDtgnF3" + "source": [ + "### Example: Advanced Filtering\n", + "\n", + "Advanced filtering in Elasticsearch allows for precise search result refinement by applying conditions.\n", + "It supports a variety of operators and can be used to filter results based on specific fields, ranges, or conditions, boosting the precision and relevance of search outcomes.\n", + "Learn more in this [query and filter contexts example](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-filter-context.html#query-filter-context-ex)." + ] }, { "cell_type": "code", - "source": [ - "response = client.search(index=\"book_index\", body={\n", - " \"knn\": {\n", - " \"field\": \"title_vector\",\n", - " \"query_vector\": model.encode(\"Best javascript books?\"),\n", - " \"k\": 10,\n", - " \"num_candidates\": 100,\n", - " \"filter\": {\n", - " \"bool\": {\n", - " \"should\": [\n", - " {\n", - " \"term\": {\n", - " \"publisher\": \"addison-wesley\"\n", - " }\n", - " },\n", - " {\n", - " \"term\": {\n", - " \"authors\": \"robert c. martin\"\n", - " }\n", - " }\n", - " ],\n", - "\n", - " }\n", - " }\n", - " }\n", - "})\n", - "\n", - "pretty_response(response)" - ], + "execution_count": 43, + "id": "fcDfiJC9g6AX", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -562,12 +533,10 @@ "id": "fcDfiJC9g6AX", "outputId": "0909515a-5b94-4863-94c6-e67015baeadf" }, - "id": "fcDfiJC9g6AX", - "execution_count": 43, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n", "ID: MOlWP4kB-GB5Evg6zHVx\n", @@ -608,55 +577,64 @@ ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ ":1: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. Instead use individual parameters.\n", " response = client.search(index=\"book_index\", body={\n" ] } + ], + "source": [ + "response = client.search(index=\"book_index\", body={\n", + " \"knn\": {\n", + " \"field\": \"title_vector\",\n", + " \"query_vector\": model.encode(\"Best javascript books?\"),\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " \"filter\": {\n", + " \"bool\": {\n", + " \"should\": [\n", + " {\n", + " \"term\": {\n", + " \"publisher\": \"addison-wesley\"\n", + " }\n", + " },\n", + " {\n", + " \"term\": {\n", + " \"authors\": \"robert c. martin\"\n", + " }\n", + " }\n", + " ],\n", + "\n", + " }\n", + " }\n", + " }\n", + "})\n", + "\n", + "pretty_response(response)" ] }, { "cell_type": "markdown", + "id": "IUMOK8h-iYrq", + "metadata": { + "id": "IUMOK8h-iYrq" + }, "source": [ "## Hybrid Search\n", "\n", "In this example, we are investigating the combination of two search algorithms: BM25 for text search and HNSW for nearest neighbor search. By combining multiple ranking methods, such as BM25 and an ML model that generates dense vector embeddings, we can achieve the best ranking results. This approach allows us to leverage the strengths of each algorithm and improve the overall search performance.\n", "\n", - "TODO: Explain why we use RRF here\n" - ], - "metadata": { - "id": "IUMOK8h-iYrq" - }, - "id": "IUMOK8h-iYrq" + "[Reciprocal Rank Fusion (RRF)](https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html) is a state-of-the-art ranking algorithm for combining results from different information retrieval strategies.\n", + "RRF outperforms all other ranking algorithms without calibration.\n", + "In brief, it enables best-in-class hybrid search, out of the box." + ] }, { "cell_type": "code", - "source": [ - "response = client.search(index=\"book_index\", body={\n", - " \"query\": {\n", - " \"match\": {\n", - " \"summary\": \"python\"\n", - " }\n", - " },\n", - " \"knn\": {\n", - " \"field\": \"title_vector\",\n", - " # generate embedding for query so it can be compared to `title_vector`\n", - " \"query_vector\" : model.encode(\"python programming\").tolist(),\n", - " \"k\": 5,\n", - " \"num_candidates\": 10\n", - " },\n", - " \"rank\": {\n", - " \"rrf\": {\n", - " \"window_size\": 100,\n", - " \"rank_constant\": 20\n", - " }\n", - " }\n", - "})\n", - "\n", - "pretty_response(response)" - ], + "execution_count": 51, + "id": "1BwZ-yjli7xA", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -664,12 +642,10 @@ "id": "1BwZ-yjli7xA", "outputId": "26eea86c-5cda-42d0-ba1e-2904e2b7865a" }, - "id": "1BwZ-yjli7xA", - "execution_count": 51, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "\n", "ID: MelWP4kB-GB5Evg6zHVx\n", @@ -719,20 +695,43 @@ ] }, { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ ":1: DeprecationWarning: The 'body' parameter is deprecated and will be removed in a future version. Instead use individual parameters.\n", " response = client.search(index=\"book_index\", body={\n" ] } + ], + "source": [ + "response = client.search(index=\"book_index\", body={\n", + " \"query\": {\n", + " \"match\": {\n", + " \"summary\": \"python\"\n", + " }\n", + " },\n", + " \"knn\": {\n", + " \"field\": \"title_vector\",\n", + " # generate embedding for query so it can be compared to `title_vector`\n", + " \"query_vector\" : model.encode(\"python programming\").tolist(),\n", + " \"k\": 5,\n", + " \"num_candidates\": 10\n", + " },\n", + " \"rank\": {\n", + " \"rrf\": {\n", + " \"window_size\": 100,\n", + " \"rank_constant\": 20\n", + " }\n", + " }\n", + "})\n", + "\n", + "pretty_response(response)" ] }, { "cell_type": "code", - "source": [ - "client.indices.delete(index=\"book_index\")" - ], + "execution_count": 52, + "id": "_OAahfg-tqrf", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -740,19 +739,20 @@ "id": "_OAahfg-tqrf", "outputId": "d8f81ba4-cdc9-4e30-edf7-6d5bb16920eb" }, - "id": "_OAahfg-tqrf", - "execution_count": 52, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "ObjectApiResponse({'acknowledged': True})" ] }, + "execution_count": 52, "metadata": {}, - "execution_count": 52 + "output_type": "execute_result" } + ], + "source": [ + "client.indices.delete(index=\"book_index\")" ] } ], @@ -780,4 +780,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From be425d7d148fd971aa74d9eeb30a923a6e10626c Mon Sep 17 00:00:00 2001 From: Liam Thompson Date: 2023年7月24日 17:10:27 +0200 Subject: [PATCH 2/2] Fix typos --- colab-notebooks-examples/search/00-quick-start.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/colab-notebooks-examples/search/00-quick-start.ipynb b/colab-notebooks-examples/search/00-quick-start.ipynb index 38028098..cb1f98a1 100644 --- a/colab-notebooks-examples/search/00-quick-start.ipynb +++ b/colab-notebooks-examples/search/00-quick-start.ipynb @@ -255,7 +255,7 @@ "import json\n", "from urllib.request import urlopen\n", "\n", - "url = \"https://raw.githubusercontent.com/joemcelroy/elasticsearch-labs/notebooks-guides/colab-notebooks-examples/search/data.json\"\n", + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/notebooks-guides/colab-notebooks-examples/search/data.json\"\n", "response = urlopen(url)\n", "books = json.loads(response.read())\n", "\n", @@ -311,7 +311,7 @@ "id": "39bdefe0" }, "source": [ - "## Making wueries\n", + "## Making queries\n", "\n", "Now that we have indexed the books, we want to perform a semantic search for books that are similar to a given query.\n", "We embed the query and perform a search."

AltStyle によって変換されたページ (->オリジナル) /