|  | 
|  | 1 | +{ | 
|  | 2 | + "cells": [ | 
|  | 3 | + { | 
|  | 4 | + "cell_type": "markdown", | 
|  | 5 | + "metadata": {}, | 
|  | 6 | + "source": [ | 
|  | 7 | + "# Anthropic / VoyageAI Embeddings\n", | 
|  | 8 | + "\n", | 
|  | 9 | + "(C) 2024 by [Damir Cavar](http://damir.cavar.me/)\n", | 
|  | 10 | + "\n", | 
|  | 11 | + "The embedding vectors from [VoyageAI](https://www.voyageai.com/) are recommended by [Anthropic](https://www.anthropic.com/). You need a VoyageAI API key to request these vectors.\n", | 
|  | 12 | + "\n", | 
|  | 13 | + "The Python module `voyageai` is required." | 
|  | 14 | + ] | 
|  | 15 | + }, | 
|  | 16 | + { | 
|  | 17 | + "cell_type": "code", | 
|  | 18 | + "execution_count": null, | 
|  | 19 | + "metadata": {}, | 
|  | 20 | + "outputs": [], | 
|  | 21 | + "source": [ | 
|  | 22 | + "!pip install -U voyageai" | 
|  | 23 | + ] | 
|  | 24 | + }, | 
|  | 25 | + { | 
|  | 26 | + "cell_type": "markdown", | 
|  | 27 | + "metadata": {}, | 
|  | 28 | + "source": [ | 
|  | 29 | + "I store my API keys in the `secret.py` file in the same location as this notebook. The file must define the variable `voyageai_apikey`, which is imported in the next cell." | 
|  | 30 | + ] | 
|  | 31 | + }, | 
|  | 32 | + { | 
|  | 33 | + "cell_type": "code", | 
|  | 34 | + "execution_count": 17, | 
|  | 35 | + "metadata": {}, | 
|  | 36 | + "outputs": [], | 
|  | 37 | + "source": [ | 
|  | 38 | + "import voyageai\n", | 
|  | 39 | + "import csv\n", | 
|  | 40 | + "import os\n", | 
|  | 41 | + "from secret import voyageai_apikey" | 
|  | 42 | + ] | 
|  | 43 | + }, | 
|  | 44 | + { | 
|  | 45 | + "cell_type": "markdown", | 
|  | 46 | + "metadata": {}, | 
|  | 47 | + "source": [ | 
|  | 48 | + "We need to define the model for the embeddings that we want to use, as well as the maximum batch size for the list of words that we can submit to the VoyageAI API:" | 
|  | 49 | + ] | 
|  | 50 | + }, | 
|  | 51 | + { | 
|  | 52 | + "cell_type": "code", | 
|  | 53 | + "execution_count": 18, | 
|  | 54 | + "metadata": {}, | 
|  | 55 | + "outputs": [], | 
|  | 56 | + "source": [ | 
|  | 57 | + "model = \"voyage-3\"\n", | 
|  | 58 | + "batch_size = 128" | 
|  | 59 | + ] | 
|  | 60 | + }, | 
|  | 61 | + { | 
|  | 62 | + "cell_type": "markdown", | 
|  | 63 | + "metadata": {}, | 
|  | 64 | + "source": [ | 
|  | 65 | + "We can create a client now to communicate with the remote VoyageAI API:" | 
|  | 66 | + ] | 
|  | 67 | + }, | 
|  | 68 | + { | 
|  | 69 | + "cell_type": "code", | 
|  | 70 | + "execution_count": 19, | 
|  | 71 | + "metadata": {}, | 
|  | 72 | + "outputs": [], | 
|  | 73 | + "source": [ | 
|  | 74 | + "vo = voyageai.Client(api_key=voyageai_apikey)" | 
|  | 75 | + ] | 
|  | 76 | + }, | 
|  | 77 | + { | 
|  | 78 | + "cell_type": "markdown", | 
|  | 79 | + "metadata": {}, | 
|  | 80 | + "source": [ | 
|  | 81 | + "The following function requests the embeddings for a word list and returns them:" | 
|  | 82 | + ] | 
|  | 83 | + }, | 
|  | 84 | + { | 
|  | 85 | + "cell_type": "code", | 
|  | 86 | + "execution_count": 20, | 
|  | 87 | + "metadata": {}, | 
|  | 88 | + "outputs": [], | 
|  | 89 | + "source": [ | 
|  | 90 | + "def get_embeddings(wordlist):\n", | 
|  | 91 | + " return vo.embed(wordlist, model=model, input_type=\"document\").embeddings" | 
|  | 92 | + ] | 
|  | 93 | + }, | 
|  | 94 | + { | 
|  | 95 | + "cell_type": "markdown", | 
|  | 96 | + "metadata": {}, | 
|  | 97 | + "source": [ | 
|  | 98 | + "The following function appends the words and their embeddings to a CSV file, one row per word. If the file does not exist yet, it first writes a header row:" | 
|  | 99 | + ] | 
|  | 100 | + }, | 
|  | 101 | + { | 
|  | 102 | + "cell_type": "code", | 
|  | 103 | + "execution_count": 24, | 
|  | 104 | + "metadata": {}, | 
|  | 105 | + "outputs": [], | 
|  | 106 | + "source": [ | 
|  | 107 | + "def save_embeddings(wordlist, embeddings, output_file):\n", | 
|  | 108 | + " if not os.path.exists(output_file):\n", | 
|  | 109 | + " with open(output_file, mode='a', encoding='utf-8', newline='') as ofp:\n", | 
|  | 110 | + " writer = csv.writer(ofp)\n", | 
|  | 111 | + " header = [\"word\"] + [str(i) for i in range(len(embeddings[0]))]\n", | 
|  | 112 | + " writer.writerow(header)\n", | 
|  | 113 | + " with open(output_file, mode='a', encoding='utf-8', newline='') as ofp:\n", | 
|  | 114 | + " writer = csv.writer(ofp)\n", | 
|  | 115 | + " for word, embedding in zip(wordlist, embeddings):\n", | 
|  | 116 | + " row = [word] + embedding # Concatenate word with its embedding values\n", | 
|  | 117 | + " writer.writerow(row)" | 
|  | 118 | + ] | 
|  | 119 | + }, | 
|  | 120 | + { | 
|  | 121 | + "cell_type": "markdown", | 
|  | 122 | + "metadata": {}, | 
|  | 123 | + "source": [ | 
|  | 124 | + "Let us try a set of words:" | 
|  | 125 | + ] | 
|  | 126 | + }, | 
|  | 127 | + { | 
|  | 128 | + "cell_type": "code", | 
|  | 129 | + "execution_count": 22, | 
|  | 130 | + "metadata": {}, | 
|  | 131 | + "outputs": [], | 
|  | 132 | + "source": [ | 
|  | 133 | + "words = \"\"\"\n", | 
|  | 134 | + "cat dog bird fish\n", | 
|  | 135 | + "car truck bike bus\n", | 
|  | 136 | + "apple banana orange pear\n", | 
|  | 137 | + "\"\"\"" | 
|  | 138 | + ] | 
|  | 139 | + }, | 
|  | 140 | + { | 
|  | 141 | + "cell_type": "markdown", | 
|  | 142 | + "metadata": {}, | 
|  | 143 | + "source": [ | 
|  | 144 | + "The following cell removes duplicate words and splits the word list into batches of at most `batch_size` words. It then requests the embeddings for each batch and appends them to the target CSV file. The `data` folder must already exist, since it is not created here." | 
|  | 145 | + ] | 
|  | 146 | + }, | 
|  | 147 | + { | 
|  | 148 | + "cell_type": "code", | 
|  | 149 | + "execution_count": 25, | 
|  | 150 | + "metadata": {}, | 
|  | 151 | + "outputs": [], | 
|  | 152 | + "source": [ | 
|  | 153 | + "new_words = list({ word for word in words.split() })\n", | 
|  | 154 | + "new_words_lists = [ new_words[i:i+batch_size] for i in range(0, len(new_words), batch_size) ]\n", | 
|  | 155 | + "output_file = os.path.join(\"data\", \"voyage_embeddings.csv\")\n", | 
|  | 156 | + "for nwl in new_words_lists:\n", | 
|  | 157 | + " if nwl:\n", | 
|  | 158 | + " embeddings = get_embeddings(nwl)\n", | 
|  | 159 | + " save_embeddings(nwl, embeddings, output_file)" | 
|  | 160 | + ] | 
|  | 161 | + }, | 
|  | 162 | + { | 
|  | 163 | + "cell_type": "markdown", | 
|  | 164 | + "metadata": {}, | 
|  | 165 | + "source": [ | 
|  | 166 | + "(C) 2024 by [Damir Cavar](http://damir.cavar.me/)" | 
|  | 167 | + ] | 
|  | 168 | + } | 
|  | 169 | + ], | 
|  | 170 | + "metadata": { | 
|  | 171 | + "kernelspec": { | 
|  | 172 | + "display_name": "Python 3", | 
|  | 173 | + "language": "python", | 
|  | 174 | + "name": "python3" | 
|  | 175 | + }, | 
|  | 176 | + "language_info": { | 
|  | 177 | + "codemirror_mode": { | 
|  | 178 | + "name": "ipython", | 
|  | 179 | + "version": 3 | 
|  | 180 | + }, | 
|  | 181 | + "file_extension": ".py", | 
|  | 182 | + "mimetype": "text/x-python", | 
|  | 183 | + "name": "python", | 
|  | 184 | + "nbconvert_exporter": "python", | 
|  | 185 | + "pygments_lexer": "ipython3", | 
|  | 186 | + "version": "3.12.3" | 
|  | 187 | + } | 
|  | 188 | + }, | 
|  | 189 | + "nbformat": 4, | 
|  | 190 | + "nbformat_minor": 2 | 
|  | 191 | +} | 
0 commit comments