From 2001f71ad87ce1db8b440d9becd4154a73278b9f Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2022年10月13日 11:23:16 +0530 Subject: [PATCH 01/38] Create signOfProduct.py --- signOfProduct.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 signOfProduct.py diff --git a/signOfProduct.py b/signOfProduct.py new file mode 100644 index 0000000..8302a43 --- /dev/null +++ b/signOfProduct.py @@ -0,0 +1,28 @@ +# Problem: Given an array arr[] of n integers, +# the integers can be positive, negative or 0 +# return the sign of the product of the elements +# 1 : positive +# -1 : negative +# 0 : zero + + +def getSignOfProduct(array): + + sign = 1 + + for num in array: + + if num == 0: + return 0 + + if num < 0: + sign = -1 * sign + + return sign + + +arr = [10, 45, -9, 3, -4, -5, 7, 32 , 0, 12 , 45, -1] +res = getSignOfProduct(arr) +print(arr, res) + +# Result: [10, 45, -9, 3, -4, -5, 7, 32, 0, 12, 45, -1] 0 From 18899a488d1ef5476ce051af25142b8f4e4d26d2 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2022年10月13日 11:51:33 +0530 Subject: [PATCH 02/38] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 271c3c7..7d48c3f 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ | You are getting two streams of data with dates as key, merge the two streams and return average of the values if there is a common date else just update the value as received in the stream, refer example in the code | [timeseries.py](https://github.com/devAmoghS/Python-Interview-Problems-for-Practice/blob/master/timeseries.py) | | Given two sorted array of sizes m and n in which all elements are distinct. Find the union between them Constraints: in O(m+n) complexity. | [union_arrays.py](https://github.com/devAmoghS/Python-Interview-Problems-for-Practice/blob/master/union_arrays.py) | | Username validation program | [username_validation.py](https://github.com/devAmoghS/Python-Interview-Problems-for-Practice/blob/master/username_validation.py) | -| | []() | +| Given an array of integers(+ve,-ve and 0) find the sign of the product of all the given values. | [signOfProduct.py](https://github.com/devAmoghS/Python-Interview-Problems-for-Practice/blob/master/signOfProduct.py) | | | []() | | | []() | | | []() | From 2fca69e650865a77a3b39c42dbb11b7d107f726e Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年2月27日 10:52:02 +0530 Subject: [PATCH 03/38] Create range_fn_float.py --- range_fn_float.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 range_fn_float.py diff --git a/range_fn_float.py b/range_fn_float.py new file mode 100644 index 0000000..51197ec --- /dev/null +++ b/range_fn_float.py @@ -0,0 +1,31 @@ +# Make a range function that works for `float` inputs + +def float_for(start, stop, increment, stop_inclusive=True): + if stop_inclusive: + stop += increment + + while start < stop: + # The yield statement returns a `generator` object to + # the one who calls the function which contains yield, + # instead of simply returning a value. 
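+        # Note: repeatedly adding a float increment accumulates rounding
+        # error, which is why the sample output below contains values such
+        # as 0.6000000000000001 instead of 0.6.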
+ yield start + start += increment + + +for i in float_for(0.5, 0.95, 0.05): + print(i) + +""" +Output: + +0.5 +0.55 +0.6000000000000001 +0.6500000000000001 +0.7000000000000002 +0.7500000000000002 +0.8000000000000003 +0.8500000000000003 +0.9000000000000004 +0.9500000000000004 +""" From 07c6c6735ce5bd93a373565bf50932c9d6e3fb8a Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年3月19日 10:37:40 +0530 Subject: [PATCH 04/38] Create data_science_interviews.md --- data_science_interviews.md | 79 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 data_science_interviews.md diff --git a/data_science_interviews.md b/data_science_interviews.md new file mode 100644 index 0000000..8b43651 --- /dev/null +++ b/data_science_interviews.md @@ -0,0 +1,79 @@ +What is PEP 8 and why is it important? + +What is Scope in Python? + +What are lists and tuples? What is the key difference between the two? + +What are modules and packages in Python? + +What is self in Python? + +What are decorators in Python? + +What is lambda in Python? Why is it used? + +What are generators in Python? + +Can you create a series from the dictionary object in pandas? + + How will you delete indices, rows, and columns from a data frame? + + Can you get items of series A that are not available in another series B? + +How are NumPy arrays advantageous over python lists? + +Write python function which takes a variable number of arguments. + +WAP (Write a program) which takes a sequence of numbers and checks if all numbers are unique. + +**************************************************************************** + +How do we use Eigenvalues and eigenvectors in PCA (Principal Components Analysis) ? + +Difference between exogenous and auto regression in time series forecasting. + +Difference between normalization and standardization, will it be used before train test split or after? + +How to reduce the impact of one feature than others + +Difference between XGBoost and FBProphet. + +Describe the scenario where you do not make stationery data in time series forecasting problem + +BERT is trained on which dataset? What model will be used if BERT does not exist? Describe self- attention mechanism. + +Difference between univariate and multivariate time series forecasting problems. +**************************************************************************** + +Find the middle node of a given LinkedList. +`Used two pointer approach` +`Slow Pointer = node.next`, and +`Fast pointer = node.next.next;` +at each iteration check if any of the pointer equals to `null` +When fast pointer is null slow pointer will be at the middle node just print node.data to get the result. + + +Print all the permutations of give string. +There are two approaches for this either +we can use `permute` library or +we can code using loops in `O(n^2)`. + + +Third last node of LinkedList, +above mentioned two pointer approach will be used here as well. + +Difference between `call by value` and `call by reference`. +In call by value, we pass the copy of variable in the function whereas +in call by reference we pass the actual variable into the function. +How we do that? We pass the memory address of that variable to the function. +These concepts are used with pointers in C/C++. + +Difference between `==` and `===` in JavaScript. +Both are used for comparison +double equal to is a content comparator whereas +triple equals compares both content and data types of LHS & RHS. 
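
A minimal Python sketch of the two-pointer (slow/fast) idea described for the middle-of-a-LinkedList question above; the `ListNode` class here is only for illustration and is not part of the original notes.

```python
class ListNode:
    def __init__(self, data, next=None):
        self.data = data
        self.next = next


def middle_node(head):
    slow = fast = head
    # When `fast` runs off the end, `slow` has covered half the list.
    while fast is not None and fast.next is not None:
        slow = slow.next
        fast = fast.next.next
    return slow


# 1 -> 2 -> 3 -> 4 -> 5
head = ListNode(1, ListNode(2, ListNode(3, ListNode(4, ListNode(5)))))
print(middle_node(head).data)  # 3
```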
+ +Difference between Breadth-first search & Depth first search. + + +**************************************************************************** From ca5932e6387f160de3f2e45439e89c69684f614d Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年3月19日 10:41:03 +0530 Subject: [PATCH 05/38] Update data_science_interviews.md --- data_science_interviews.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/data_science_interviews.md b/data_science_interviews.md index 8b43651..bdaf199 100644 --- a/data_science_interviews.md +++ b/data_science_interviews.md @@ -76,4 +76,39 @@ triple equals compares both content and data types of LHS & RHS. Difference between Breadth-first search & Depth first search. +**************************************************************************** +Explanation of the past project. What were the features used and how did you determine performance? + +What is the difference between linear regression and logistic regression? + +What is the internal working of logistic regression (LR)? + +What is the loss function of LR? + +Name some hyperparameters used in LR? Why do we use regularization? + +When do we use accuracy as a metric? When should we not use accuracy? + +How do you deal with imbalance data? + +What is SMOTE and how is it different from stratified sampling? + +Watch this video to understand how SMOTE works [https://www.youtube.com/watch?v=U3X98xZ4_no] + +What is better 0.51 AUC (Area Under the Curve) or 0.43 F1 score? Which one should you present to a client? + +Watch this video to understand how AUC is interpreted [https://www.youtube.com/watch?v=mUMd_cKU0VM] + +What does the ROC AUC value signify? + +Do we only use the threshold of 0.5 or can we use other thresholds in LR? If yes, how do we find them? + +Can I use a sales forecasting model built using pencils data to be used in erasers data? + +How would you compare the performance of two forecasting models? + +What are the different metrics used in regression analysis? Which metric should be used where? + +How do you build a testing pipeline for a data science model? [https://www.kdnuggets.com/2020/08/unit-test-data-pipeline-thank-yourself-later.html] + **************************************************************************** From 206d3005a6127574c350bebe572c82f5d9164656 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年3月19日 10:47:50 +0530 Subject: [PATCH 06/38] Update data_science_interviews.md --- data_science_interviews.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/data_science_interviews.md b/data_science_interviews.md index bdaf199..ba18942 100644 --- a/data_science_interviews.md +++ b/data_science_interviews.md @@ -112,3 +112,35 @@ What are the different metrics used in regression analysis? Which metric should How do you build a testing pipeline for a data science model? [https://www.kdnuggets.com/2020/08/unit-test-data-pipeline-thank-yourself-later.html] **************************************************************************** + +How does Iterators and generators work in Python ? + +What does Python constructors do and how are they useful ? + +Explain what Map function does in Python ? + +How do you flatten an image(matrix) in a deep learning architecture ? + +Difference between semantic segmentation and instance segmentation ? + +Which are the different types of pooling operations - what is the visual effect of applying a max pooling operation and average pooling operation on an image ? 
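
A small NumPy illustration (not part of the original question list) of the visual effect asked about above: max pooling keeps the strongest activation in each window, while average pooling smooths the window out.

```python
import numpy as np

image = np.array([
    [1, 3, 2, 4],
    [5, 6, 1, 2],
    [7, 2, 9, 0],
    [4, 8, 3, 5],
])

def pool(x, size=2, mode="max"):
    """Pooling with stride equal to the window size (an assumption)."""
    h, w = x.shape
    out = np.zeros((h // size, w // size))
    for i in range(0, h, size):
        for j in range(0, w, size):
            window = x[i:i + size, j:j + size]
            out[i // size, j // size] = window.max() if mode == "max" else window.mean()
    return out

print(pool(image, mode="max"))   # sharp: keeps the dominant feature per window
print(pool(image, mode="avg"))   # smoother: averages each window
```

For the convolution-size questions that follow, the usual formula is output = (input − kernel + 2·padding) / stride + 1, so a 128×128 input convolved with a 3×3 kernel at stride 1 with no padding gives a 126×126 output.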
+ +What is the math behind convolution operation – what will be the size of a particular image (128*128) after convolution operation with a 3*3 kernel ? + +what will be the size of a particular image (128*128) after convolution operation for a 3*3 image after applying 1*1 kernel ? + +What is the Loss function and optimization function of region proposal network ? + +What is Image down sampling – why do we do down sampling ? + +Python coding: Solve the following using a for loop, by defining a function and put in inside a class + +`#Input : a =[1,2,3] ` + +`#Output : ["hello1","hello2","hello3"]` + +Tradeoff between YOLO and FasterRCNN in terms of speed and accuracy ? + +What are feature maps and how are they obtained ? + +**************************************************************************** From dd2cf7fc75b5583fbb3c66e61dbb66e0b2897d03 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年3月19日 10:53:29 +0530 Subject: [PATCH 07/38] Update data_science_interviews.md --- data_science_interviews.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/data_science_interviews.md b/data_science_interviews.md index ba18942..a5f66f3 100644 --- a/data_science_interviews.md +++ b/data_science_interviews.md @@ -144,3 +144,41 @@ Tradeoff between YOLO and FasterRCNN in terms of speed and accuracy ? What are feature maps and how are they obtained ? **************************************************************************** +How will you count unique values in a data frame column. + +How will you convert a column data type to string ? + +How will you obtain correlation coefficient between 2 columns in a data frame ? + +How will you merge two data frame based on common column (when column name is same) ? + +How will you merge you merge two data frames base on common column name (column name is different in left and right data frame) ? + +Define the term correlation with respect to statistics ? + +What are the types of correlation coefficient? + +What is the difference in Pearson correlation coefficient and spearmen correlation coefficient? + +How do we deal with categorical variables for statistical analysis? + +How do you obtain correlation between 2 categorical variables? +How do you find Correlation between one categorical variable and other numerical variables? + +What is the difference between dictionary and list? + +How do you append a dictionary with another dictionary? + +What is the difference between tuples and list ? + +Can a tuple have different data types of element contained within it ? + +How do you read data from database directly and convert it into data frame for analysis? + +How do you import file.py function into another python file ? + +What are generators in python ? + +How will you print index and values of a list without range function ? + +**************************************************************************** From 63853dff4d0978bf4224915ac0f8de471c379188 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年3月19日 10:56:26 +0530 Subject: [PATCH 08/38] Update data_science_interviews.md --- data_science_interviews.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/data_science_interviews.md b/data_science_interviews.md index a5f66f3..0fca32e 100644 --- a/data_science_interviews.md +++ b/data_science_interviews.md @@ -182,3 +182,35 @@ What are generators in python ? How will you print index and values of a list without range function ? 
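
A quick illustration for the index-and-values question just above — `enumerate` is the usual answer, since it yields `(index, value)` pairs without needing `range(len(...))`:

```python
fruits = ["apple", "banana", "cherry"]
for idx, value in enumerate(fruits):
    print(idx, value)
```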
**************************************************************************** + +What is the difference between Docker and Containers? + +How do you restart containers on failure? + +How do you run a container in Docker? + +Can you run a program that takes 4 hours to run in AWS Lambda? + +What is the difference between ADD and COPY commands wrt. Dockerfile ? + +Experience with different AWS services such as CloudFormation or Glue? + +What is the schema in S3? + +Can the lambda written in AWS interact with other infrastructure? + +What is the Dockerfile setup if you want to expose the model as an API? + +Difference between UDF, pandas UDF and pyspark UDFs? + +Difference between synchronous and asynchronous request? How do you program one in Python? + +What is the use of a DAG (Directed Acyclic Graph) in Spark? + +Given the no. Of terms, print the Fibonacci sequence: Hint try both iterative and recursive methods [https://www.programiz.com/python-programming/examples/fibonacci-sequence] + +Given an input string, print the length of the longest common substring without any repeating characters. [https://leetcode.com/problems/longest-substring-without-repeating-characters/] + +Given an input string, write a function that returns the Run Length Encoded string for the input string. For example, if the input string is "ssslbbbbppiitttc", then the function should return "s3l1b4p2i2t3c1" + +**************************************************************************** From 78cc3ac59468142841de5536d4f60263d56e9d4c Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年3月19日 10:59:57 +0530 Subject: [PATCH 09/38] Update data_science_interviews.md --- data_science_interviews.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/data_science_interviews.md b/data_science_interviews.md index 0fca32e..1aff867 100644 --- a/data_science_interviews.md +++ b/data_science_interviews.md @@ -214,3 +214,33 @@ Given an input string, print the length of the longest common substring without Given an input string, write a function that returns the Run Length Encoded string for the input string. For example, if the input string is "ssslbbbbppiitttc", then the function should return "s3l1b4p2i2t3c1" **************************************************************************** + +Given a list, `ls = [9,8,3,4,1,0,2,7,7,6]`, write a function to get nth highest element without using any inbuilt functions or sorting. + +Write a python class with method to sort a list and related questions on classes, static methods, init etc. + +Difference between `RANK` and `DENSE RANK`? + +Difference between `parquet` and `csv` file format? How are files written in a parquet file? + +What is Cursor command in SQL? + +Difference between Spark vs MapReduce architecture? + +Explanation of ETL pipeline + +Containerization v/s virtualization + +What is port redirection in docker? + +How to create a table with Databricks storage? + +Difference between SQL and NoSQL DB? + +A scenario where data keeps on changing, with adding and updating new features , would you consider SQL or NoSQL? + +What is the difference between iterators and generators + +What is the difference between OLAP and OLTP? 
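
A worked sketch for the run-length-encoding question earlier in this file ("ssslbbbbppiitttc" should become "s3l1b4p2i2t3c1"); one straightforward approach is to walk the string once and emit a character plus its count whenever the run changes:

```python
def run_length_encode(s):
    if not s:
        return ""
    encoded = []
    count = 1
    for prev, curr in zip(s, s[1:]):
        if curr == prev:
            count += 1
        else:
            encoded.append(prev + str(count))
            count = 1
    encoded.append(s[-1] + str(count))
    return "".join(encoded)


print(run_length_encode("ssslbbbbppiitttc"))  # s3l1b4p2i2t3c1
```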
+ +**************************************************************************** From 5d943d9e27120f459682990dcdbcec6964de6f5a Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年6月12日 12:04:07 +0530 Subject: [PATCH 10/38] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7d48c3f..e47f4b7 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,10 @@ # Python-Interview-Problems-for-Practice (now supported with *Code Style*) -[![star this repo](http://githubbadges.com/star.svg?user=devAmoghS&repo=Python-Interview-Problems-for-Practice&style=flat&color=fff&background=007ec6)](https://github.com/ddavison/github-badges) -[![fork this repo](http://githubbadges.com/fork.svg?user=devAmoghS&repo=Python-Interview-Problems-for-Practice&style=flat&color=fff&background=007ec6)](https://github.com/ddavison/github-badges/fork) +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=devAmoghS/Python-Interview-Problems-for-Practice&type=Date)](https://star-history.com/#devAmoghS/Python-Interview-Problems-for-Practice&Date) + + [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) ## Updates From c5f339e5e2c69c80943d24c0ae324d52e073ea3e Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2023年6月12日 12:09:37 +0530 Subject: [PATCH 11/38] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index e47f4b7..a6c70b4 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ # Python-Interview-Problems-for-Practice (now supported with *Code Style*) +![GitHub stars](https://img.shields.io/github/stars/devAmoghS/Python-Interview-Problems-for-Practice?style=for-the-badge) ![GitHub forks](https://img.shields.io/github/forks/devAmoghS/Python-Interview-Problems-for-Practice?label=Forks&style=for-the-badge) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) ## Star History [![Star History Chart](https://api.star-history.com/svg?repos=devAmoghS/Python-Interview-Problems-for-Practice&type=Date)](https://star-history.com/#devAmoghS/Python-Interview-Problems-for-Practice&Date) -[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) + ## Updates From bec978128a01444059f5d58ecf2ac8eaf1c2dd67 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年7月30日 19:11:44 +0530 Subject: [PATCH 12/38] Create 1.What does generative truly mean.md --- .../1.What does generative truly mean.md | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 GenerativeAI/1.What does generative truly mean.md diff --git a/GenerativeAI/1.What does generative truly mean.md b/GenerativeAI/1.What does generative truly mean.md new file mode 100644 index 0000000..8ac258b --- /dev/null +++ b/GenerativeAI/1.What does generative truly mean.md @@ -0,0 +1,28 @@ +In the context of deep learning, **generative** refers to models that are capable of generating new data samples that are similar to the training data they were trained on. These models learn the underlying probability distribution of the training data and use it to create novel samples[1][2]. + +The key principles behind generative deep learning models are: + +## Learning the Data Distribution + +Generative models learn the probability distribution of the training data. 
This allows them to generate new samples that are statistically similar to the original data[2]. + +## Sampling from the Learned Distribution + +Once the model has learned the data distribution, it can sample from this distribution to generate new samples. This sampling process introduces randomness, which allows the model to produce varied outputs for the same input[1]. + +## Adversarial Training (GANs) + +One popular type of generative model is the Generative Adversarial Network (GAN). GANs consist of two neural networks - a generator and a discriminator. The generator generates new samples, while the discriminator tries to distinguish between real and generated samples. Through this adversarial training process, the generator learns to produce more realistic samples that can fool the discriminator[2]. + +## Variational Autoencoders (VAEs) + +Another important class of generative models are Variational Autoencoders (VAEs). VAEs learn a latent representation of the data and use this representation to generate new samples. They are trained to maximize the likelihood of the training data under the learned generative model[3]. + +In summary, generative deep learning models learn the underlying probability distribution of the training data and use this knowledge to generate novel samples that are statistically similar to the original data. This allows them to create impressive outputs like realistic images, coherent text, and plausible audio[3][4][5]. + +Citations: +[1] https://www.cmu.edu/intelligentbusiness/expertise/genai-principles.pdf +[2] https://www.sixsigmacertificationcourse.com/the-basic-principles-of-generative-models-with-an-example/ +[3] https://www.shroffpublishers.com/books/9789355429988/ +[4] https://www.amazon.in/Generative-Deep-Learning-David-Foster-ebook/dp/B0C3WVJWBF +[5] https://www.amazon.in/Deep-Learning-Scratch-Building-Principles/dp/935213902X From 9cdd6c9abb99ada84ef86a0fb9fe9b6d98ab6c6c Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年7月31日 14:27:25 +0530 Subject: [PATCH 13/38] Create 2. How does FAISS work.md --- GenerativeAI/2. How does FAISS work.md | 42 ++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 GenerativeAI/2. How does FAISS work.md diff --git a/GenerativeAI/2. How does FAISS work.md b/GenerativeAI/2. How does FAISS work.md new file mode 100644 index 0000000..3cf0b89 --- /dev/null +++ b/GenerativeAI/2. How does FAISS work.md @@ -0,0 +1,42 @@ +Here is an explanation of how Facebook AI Similarity Search (FAISS) works, from beginner to advanced, using the example query vector "I like to play football": + +## Beginner Explanation + +FAISS is a library that allows you to quickly find similar items in a large dataset of vectors. For example, if you have a sentence embedding vector for the query "I like to play football", FAISS can efficiently search through millions or billions of other sentence embedding vectors to find the ones that are most similar. + +To use FAISS, you first need to create an index from your dataset of vectors. This involves some preprocessing to optimize the index for fast similarity search. Then, when you have a query vector like "I like to play football", you can pass it to FAISS to search the index and get back the most similar vectors, ranked by similarity score. + +FAISS uses techniques like quantization and efficient distance computation to make the search much faster than a brute force approach of comparing the query to every vector in the dataset one by one. 
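
To make this flow concrete, here is a minimal sketch using random vectors in place of real sentence embeddings (an assumption — in practice the query vector for "I like to play football" would come from a sentence encoder):

```python
import numpy as np
import faiss  # pip install faiss-cpu

d = 128                                               # embedding dimension
xb = np.random.random((10_000, d)).astype("float32")  # dataset vectors
xq = np.random.random((1, d)).astype("float32")       # query vector

index = faiss.IndexFlatL2(d)           # exact (brute-force) L2 index
index.add(xb)                          # index the dataset
distances, ids = index.search(xq, 4)   # 4 nearest neighbours
print(ids, distances)
```

`IndexFlatL2` is the simple exact index; the approximate techniques described next (clustering, quantization) are what make search sub-linear on very large datasets.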
+ +## Intermediate Explanation + +Let's say you have a dataset of 1 billion sentence embedding vectors, and you want to find the 10 most similar vectors to "I like to play football". Here's how FAISS would work: + +1. **Preprocessing**: FAISS builds an index data structure from the 1 billion vectors. This involves partitioning the vectors into clusters and encoding them using product quantization to reduce memory usage[1][2]. + +2. **Searching**: When you pass the query vector "I like to play football" to FAISS, it first identifies which clusters the query is closest to. It then only compares the query to the vectors within those clusters, rather than all 1 billion vectors[3]. + +3. **Ranking**: FAISS computes the similarity scores between the query and the vectors in the relevant clusters. It returns the 10 vectors with the highest scores, which are the most similar to the query[4]. + +FAISS is highly optimized for this process, using techniques like multi-threading and GPU acceleration to make the search extremely fast, even on a dataset of 1 billion vectors[1][5]. + +## Advanced Explanation + +Under the hood, FAISS uses advanced indexing algorithms to enable efficient similarity search. Some key components are: + +- **Inverted file index (IVF)**: This partitions the vector space into Voronoi cells. For a given query, FAISS first identifies the cells it is closest to, narrowing down the search[3]. + +- **Product quantization (PQ)**: Vectors are decomposed into subvectors which are quantized separately. This allows FAISS to store an approximation of the vectors very compactly in RAM[1][2]. + +- **Hierarchical navigable small world (HNSW) graph**: An efficient nearest neighbor graph structure that allows fast traversal to find similar vectors[4]. + +FAISS provides a range of indexing algorithms that make different time/accuracy/memory tradeoffs. The most accurate is IVF with PQ, which is what FAISS would likely use for a query like "I like to play football" on a large dataset[1][2][3]. + +By leveraging these advanced indexing techniques, FAISS is able to provide state-of-the-art similarity search performance, enabling applications like efficient semantic search, personalized recommendations, and content-based retrieval[4][5]. + +Citations: +[1] https://engineering.fb.com/2017/03/29/data-infrastructure/faiss-a-library-for-efficient-similarity-search/ +[2] https://ai.meta.com/tools/faiss/ +[3] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/ +[4] https://www.activeloop.ai/resources/glossary/faiss-facebook-ai-similarity-search/ +[5] https://www.linkedin.com/pulse/exploring-power-facebook-ai-similarity-search-library-venkatesh-mungi-6ncof From b1a72d2ffb4640588f8680ccb93c3bf4e9739b4c Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年7月31日 14:32:00 +0530 Subject: [PATCH 14/38] Create 3. FAISS Advanced explaination.md --- .../3. FAISS Advanced explaination.md | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 GenerativeAI/3. FAISS Advanced explaination.md diff --git a/GenerativeAI/3. FAISS Advanced explaination.md b/GenerativeAI/3. FAISS Advanced explaination.md new file mode 100644 index 0000000..b33732f --- /dev/null +++ b/GenerativeAI/3. FAISS Advanced explaination.md @@ -0,0 +1,93 @@ +To expand on the advanced explanation of Facebook AI Similarity Search (FAISS) and incorporate mathematical expressions, we will delve into the underlying mechanisms and algorithms used in FAISS, using the example query vector "I like to play football". 
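
Before the detailed walk-through below, a compact sketch of the inverted-file (IVF) search flow that the following sections formalise — vectors are clustered into `nlist` cells and only `nprobe` cells are scanned per query (random data stands in for real embeddings here):

```python
import numpy as np
import faiss

d, nlist = 64, 100
xb = np.random.random((50_000, d)).astype("float32")
xq = np.random.random((1, d)).astype("float32")

quantizer = faiss.IndexFlatL2(d)            # coarse quantizer holding the centroids
index = faiss.IndexIVFFlat(quantizer, d, nlist)
index.train(xb)                             # learn the nlist cluster centroids
index.add(xb)

index.nprobe = 8                            # search 8 of the 100 cells per query
D, I = index.search(xq, 5)
print(I, D)
```

Raising `nprobe` improves recall at the cost of speed, which is the accuracy/latency trade-off discussed throughout this note.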
+ +## Advanced Explanation of FAISS + +FAISS is designed for efficient similarity search and clustering of dense vectors, typically in high-dimensional spaces. The core idea is to index a large dataset of vectors so that we can quickly retrieve the most similar vectors to a given query vector. + +### Key Components of FAISS + +1. **Vector Representation**: + Each sentence or item is represented as a vector in a high-dimensional space. For example, the sentence "I like to play football" might be encoded into a vector $$\mathbf{q}$$ of dimension $$d$$ (e.g., $$d = 768$$ for sentence embeddings). + +2. **Distance Metrics**: + FAISS supports various distance metrics for measuring similarity between vectors, including: + + - **L2 (Euclidean) Distance**: + $$ + D(\mathbf{x}, \mathbf{y}) = \sqrt{\sum_{i=1}^{d} (x_i - y_i)^2} + $$ + - **Inner Product** (used for cosine similarity when vectors are normalized): + $$ + D(\mathbf{x}, \mathbf{y}) = \sum_{i=1}^{d} x_i \cdot y_i + $$ + +3. **Index Structures**: + FAISS employs several indexing strategies to optimize search performance: + + - **Flat Index**: This is the simplest form, where all vectors are stored, and the search is performed using brute force. For a query vector $$\mathbf{q}$,ドル the search involves calculating the distance to every vector in the index. + + - **Inverted File Index (IVF)**: This partitions the vector space into clusters. Each cluster is represented by a centroid, and vectors are assigned to these clusters. The search process involves: + 1. **Cluster Assignment**: For a query vector $$\mathbf{q}$,ドル find the nearest centroids using a coarse quantizer (e.g., using L2 distance). + 2. **Refined Search**: Only search within the nearest clusters. + + - **Product Quantization (PQ)**: This technique compresses the vector representation to save memory. It divides each vector into $$M$$ subvectors and quantizes each subvector separately. The distance computation for a query vector $$\mathbf{q}$$ involves: + $$ + D(\mathbf{q}, \mathbf{c}) \approx \sum_{m=1}^{M} D(\mathbf{q}_m, \mathbf{c}_m) + $$ + where $$\mathbf{c}_m$$ is the quantized representation of the $$m^{th}$$ subvector. + + - **Hierarchical Navigable Small World (HNSW)**: This is a graph-based approach that allows for fast nearest neighbor searches. It constructs a multi-layer graph where each layer contains a subset of the vectors, enabling efficient traversal to find nearest neighbors. + +### Example Search Process + +1. **Index Creation**: + Suppose we have a dataset of vectors representing various sentences, including our example. We would first create an index: + ```python + import faiss + d = 768 # Example dimension + index = faiss.IndexIVFPQ(faiss.IndexFlatL2(d), d, nlist=100, M=16, nbits=8) + index.train(training_vectors) # Train the index with a subset of vectors + index.add(vectors) # Add all vectors to the index + ``` + +2. **Query Vector**: + For the query "I like to play football", we encode it into a vector $$\mathbf{q}$$: + ```python + xq = model.encode(["I like to play football"]) + ``` + +3. **Search Execution**: + To find the 4 nearest neighbors, we execute: + ```python + k = 4 + D, I = index.search(xq, k) # D contains distances, I contains indices of neighbors + ``` + +4. **Distance Calculation**: + The distances $$D$$ are computed using the selected metric (e.g., L2 distance or inner product) based on the indexed vectors. + +5. **Results Interpretation**: + The indices returned in $$I$$ correspond to the most similar sentences in the dataset. 
For instance, if the results are: + ``` + I = [4586, 10252, 12465, 190] + ``` + These indices can be used to retrieve the actual sentences that are most similar to the query vector. + +### Performance Optimization + +FAISS is optimized for performance through various means: + +- **GPU Acceleration**: FAISS can leverage GPU resources to handle large-scale datasets, significantly speeding up both indexing and searching processes. + +- **Parameter Tuning**: Users can adjust parameters such as the number of clusters ($$nlist$$) and the number of probes ($$nprobe$$) to balance between speed and accuracy. For example, increasing $$nprobe$$ allows for more clusters to be searched, potentially improving accuracy at the cost of speed. + +- **Memory Management**: By using quantization techniques like PQ, FAISS reduces memory usage, allowing it to handle larger datasets efficiently. + +By combining these techniques, FAISS achieves state-of-the-art performance in similarity search, making it suitable for applications such as semantic search, recommendation systems, and more. + +Citations: +[1] https://github.com/facebookresearch/faiss/wiki/Faiss-indexes +[2] https://unfoldai.com/effortless-large-scale-image-retrieval-with-faiss-a-hands-on-tutorial/ +[3] https://www.youtube.com/watch?v=sKyvsdEv6rk +[4] https://www.pinecone.io/learn/series/faiss/faiss-tutorial/ +[5] https://ai.meta.com/tools/faiss/ From 570cf2bcbd9e334f3a1c2209c0202cd4c24c1a71 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年7月31日 14:52:29 +0530 Subject: [PATCH 15/38] Create References.md --- GenerativeAI/References.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 GenerativeAI/References.md diff --git a/GenerativeAI/References.md b/GenerativeAI/References.md new file mode 100644 index 0000000..7c8d3d0 --- /dev/null +++ b/GenerativeAI/References.md @@ -0,0 +1,2 @@ +FAISS Similarity Search +https://www.youtube.com/playlist?list=PLIUOU7oqGTLhlWpTz4NnuT3FekouIVlqc From 4b8e2f1b2d18219d847c65227f924a945be8cbc8 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年7月31日 19:20:41 +0530 Subject: [PATCH 16/38] Create 4. Transformers and Vector DB Interview prep.md --- ...ansformers and Vector DB Interview prep.md | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 GenerativeAI/4. Transformers and Vector DB Interview prep.md diff --git a/GenerativeAI/4. Transformers and Vector DB Interview prep.md b/GenerativeAI/4. Transformers and Vector DB Interview prep.md new file mode 100644 index 0000000..bb44ae7 --- /dev/null +++ b/GenerativeAI/4. Transformers and Vector DB Interview prep.md @@ -0,0 +1,166 @@ +Here are 30 key questions and answers to help you prepare for a Generative AI interview, with a focus on Transformer architectures and vector databases: + +## Transformer Architectures + +1. **What are the key components of a Transformer architecture?** + - Encoder and decoder layers + - Attention mechanisms + - Feed-forward neural networks + - Layer normalization and residual connections + +2. **How does the attention mechanism work in Transformers?** + - Computes a weighted sum of values based on the compatibility between keys and queries + - Allows the model to focus on relevant parts of the input sequence + - Enables capturing long-range dependencies without relying on recurrence or convolutions + +3. 
**What are the advantages of using Transformer architectures compared to RNNs and CNNs?** + - Parallelization of computations + - Ability to capture long-range dependencies + - Improved performance on tasks like machine translation and language understanding + +4. **Can you explain the concept of self-attention in Transformers?** + - Attention mechanism applied to the same sequence + - Allows the model to attend to different positions within the same sequence + - Helps capture contextual information within a sequence + +5. **How do Transformer architectures handle variable-length input sequences?** + - Use of padding tokens and masking techniques + - Padding is added to ensure all sequences have the same length + - Masking is applied to ignore the contributions of padding tokens during attention computations + +6. **What are the differences between encoder-only, decoder-only, and encoder-decoder Transformer architectures?** + - Encoder-only: Used for tasks like language understanding (e.g., BERT) + - Decoder-only: Used for autoregressive tasks like language generation (e.g., GPT) + - Encoder-decoder: Used for sequence-to-sequence tasks like machine translation (e.g., Transformer) + +7. **Can you explain the concept of positional encoding in Transformer architectures?** + - Injects positional information into the input embeddings + - Enables the model to understand the relative or absolute positions of tokens in the sequence + - Common techniques include sinusoidal positional encoding and learned positional embeddings + +8. **How do Transformer architectures handle long-range dependencies compared to RNNs and CNNs?** + - Attention mechanisms allow for direct connections between distant tokens + - Reduces the path length between related tokens + - Enables better modeling of long-range dependencies + +9. **What are the challenges and limitations of Transformer architectures?** + - Quadratic complexity of attention with respect to sequence length + - Memory and computational requirements can be high for long sequences + - Potential for overfitting due to lack of inductive biases present in RNNs and CNNs + +10. **Can you discuss some recent advancements and variants of Transformer architectures?** + - Sparse Transformer: Reduces computational complexity by using sparse attention patterns + - Reformer: Uses locality-sensitive hashing to efficiently compute attention + - Longform Transformer: Designed for long-form text generation tasks + +## Vector Databases + +11. **What are vector databases, and how do they differ from traditional databases?** + - Store data in the form of high-dimensional vectors + - Optimized for similarity search and nearest neighbor retrieval + - Differ from traditional databases in terms of data structure and query types + +12. **What are the main use cases of vector databases in Generative AI?** + - Semantic search and retrieval of relevant information for generation tasks + - Storage and indexing of embeddings generated by Generative AI models + - Efficient retrieval of similar examples for few-shot learning and prompting + +13. **Can you explain the concept of approximate nearest neighbor (ANN) search in vector databases?** + - Aims to find the closest vectors to a given query vector + - Employs techniques like locality-sensitive hashing (LSH) and graph-based methods + - Provides a trade-off between search accuracy and computational efficiency + +14. 
**How do vector databases handle high-dimensional data?** + - Use specialized index structures like HNSW (Hierarchical Navigable Small World) graphs + - Leverage dimensionality reduction techniques like PCA or t-SNE + - Optimize for efficient storage and retrieval of high-dimensional vectors + +15. **What are some popular vector database systems used in Generative AI?** + - Pinecone: Offers a managed vector database service with support for ANN search + - Milvus: An open-source vector database with a focus on scalability and performance + - Weaviate: Combines vector search with a GraphQL API for easy integration + +16. **Can you discuss the role of vector databases in few-shot learning and prompting for Generative AI?** + - Store relevant examples or prompts as vectors + - Retrieve similar examples based on the input prompt or context + - Provide additional information or guidance to the Generative AI model + +17. **How do vector databases enable efficient retrieval of relevant information for generation tasks?** + - Store generated outputs or relevant information as vectors + - Perform similarity search to find the most relevant vectors based on the input + - Retrieve the corresponding information to guide or enhance the generation process + +18. **What are some challenges and limitations of using vector databases in Generative AI?** + - Handling dynamic updates and changes to the stored vectors + - Ensuring data privacy and security when storing sensitive information + - Balancing the trade-off between search accuracy and computational efficiency + +19. **Can you discuss the integration of vector databases with Generative AI models?** + - Seamless integration through APIs or query languages + - Ability to perform vector search and retrieval within the Generative AI pipeline + - Enables end-to-end solutions for tasks like question-answering and dialogue generation + +20. **What are some future trends and advancements in vector databases for Generative AI?** + - Improved scalability and performance for handling large-scale datasets + - Incorporation of deep learning techniques for better similarity search + - Integration with other AI technologies like knowledge graphs and reasoning engines + +## Generative AI Fundamentals + +21. **What are the key differences between discriminative and generative models in machine learning?** + - Discriminative models learn the decision boundary between classes + - Generative models learn the underlying data distribution to generate new samples + +22. **Can you explain the concept of latent space in generative models?** + - Represents a lower-dimensional space where the model encodes data features + - Enables manipulation of these features to generate new, meaningful samples + +23. **What are some common evaluation metrics used for assessing the quality of generated samples?** + - Inception Score (IS): Measures the quality and diversity of generated samples + - Fréchet Inception Distance (FID): Compares the statistics of generated samples with real samples + - Human evaluation: Relies on subjective assessments by human judges + +24. **How do you handle mode collapse in Generative Adversarial Networks (GANs)?** + - Use techniques like mini-batch discrimination and spectral normalization + - Incorporate different loss functions like WGAN-GP + - Employ data augmentation strategies to increase the diversity of training samples + +25. 
**Can you discuss the role of Generative AI in few-shot learning and prompt engineering?** + - Generative models can generate relevant examples or prompts for few-shot learning + - Prompts can guide the model to generate outputs that align with specific instructions or styles + - Enables efficient learning from limited data and customization of generated outputs + +26. **What are some ethical considerations when deploying Generative AI systems?** + - Potential for generating biased or harmful content + - Ensuring transparency and accountability in the decision-making process + - Addressing issues related to data privacy and security + +27. **How do you assess the quality and consistency of generated outputs from Generative AI models?** + - Employ techniques like hallucination detection and factual consistency checking + - Utilize tools like SelfCheckGPT and G-EVAL for evaluating the quality of generated text + - Combine different evaluation methods to provide a comprehensive assessment + +28. **Can you discuss the role of Generative AI in multimodal learning and generation?** + - Integrate information from multiple modalities like text, images, and audio + - Leverage cross-modal attention mechanisms to capture interactions between modalities + - Enable generation of outputs in one modality conditioned on inputs from other modalities + +29. **What are some common challenges and limitations in training Generative AI models?** + - Unstable training dynamics and mode collapse in GANs + - Difficulty in scaling up to high-resolution image generation or long-form text generation + - Ensuring consistent and coherent generation across multiple steps or iterations + +30. **Can you discuss the potential future advancements and applications of Generative AI?** + - Continued improvements in generation quality and diversity + - Expansion to new domains like video, 3D objects, and interactive environments + - Integration with other AI technologies like reasoning engines and knowledge bases + - Widespread adoption in various industries like entertainment, healthcare, and education + +Remember to tailor your answers based on your specific experience and knowledge, and feel free to explore additional resources to deepen your understanding of Transformer architectures and vector databases in the context of Generative AI. + +Citations: +[1] https://www.usebraintrust.com/hire/interview-questions/generative-ai-specialists +[2] https://www.reddit.com/r/MachineLearning/comments/17u7b19/d_genaillm_interview_prep/ +[3] https://github.com/aishwaryanr/awesome-generative-ai-guide/blob/main/interview_prep/60_gen_ai_questions.md +[4] https://incubity.ambilio.com/top-25-generative-ai-interview-questions-with-answers/ +[5] https://www.youtube.com/watch?v=F1lsFTpsQLI From a688d5a58684c5cefbcd0d3b032d1ea7309d3fa7 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年7月31日 19:23:38 +0530 Subject: [PATCH 17/38] Create 5. FAISS interview prep.md --- GenerativeAI/5. FAISS interview prep.md | 127 ++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 GenerativeAI/5. FAISS interview prep.md diff --git a/GenerativeAI/5. FAISS interview prep.md b/GenerativeAI/5. FAISS interview prep.md new file mode 100644 index 0000000..edd2b41 --- /dev/null +++ b/GenerativeAI/5. 
FAISS interview prep.md @@ -0,0 +1,127 @@ +Here are 30 interview questions and answers focused on FAISS (Facebook AI Similarity Search) and its applications in Generative AI, covering various scenarios you might encounter: + +## FAISS and Its Applications + +1. **What is FAISS and what are its primary uses?** + - FAISS is a library developed by Facebook AI Research designed for efficient similarity search and clustering of dense vectors. It is primarily used for tasks like nearest neighbor search in high-dimensional spaces, which is essential in applications such as image retrieval, recommendation systems, and natural language processing. + +2. **How does FAISS handle high-dimensional data?** + - FAISS employs various indexing structures, such as inverted file systems and product quantization, to efficiently manage high-dimensional data. These structures allow for fast approximate nearest neighbor searches while reducing memory usage. + +3. **What are the different types of indexes available in FAISS?** + - FAISS provides several index types, including: + - Flat Index: Exact nearest neighbor search. + - IVFFlat: Inverted file index with flat quantization for approximate search. + - HNSW: Hierarchical Navigable Small World graph for efficient approximate searches. + - PQ (Product Quantization): Reduces the dimensionality of vectors for faster searches. + +4. **Can you explain the concept of approximate nearest neighbor (ANN) search in FAISS?** + - ANN search in FAISS aims to find the closest vectors to a query vector quickly without exhaustively comparing all vectors. It uses techniques like clustering and quantization to limit the search space, trading off some accuracy for speed. + +5. **What are the advantages of using FAISS over other vector search libraries?** + - FAISS is optimized for performance, scalability, and flexibility. It supports large datasets, provides various indexing methods, and is designed to work efficiently on both CPUs and GPUs, making it suitable for high-performance applications. + +6. **How do you optimize FAISS for large-scale datasets?** + - To optimize FAISS for large datasets, you can: + - Use appropriate index types like IVFPQ or HNSW for faster searches. + - Leverage GPU acceleration for computation-heavy tasks. + - Fine-tune parameters like the number of clusters and quantization levels based on your data characteristics. + +7. **What is the role of vector embeddings in FAISS?** + - Vector embeddings represent data points in a high-dimensional space, capturing their semantic meanings. In FAISS, these embeddings are used to perform similarity searches, allowing the retrieval of similar items based on their vector representations. + +8. **Can you describe a scenario where you used FAISS in a project?** + - In a project for an e-commerce platform, I implemented FAISS to enhance the product recommendation system. By indexing product embeddings generated from user interactions, we achieved real-time recommendations based on user preferences, significantly improving user engagement. + +9. **What challenges did you face while implementing FAISS, and how did you overcome them?** + - One challenge was managing memory usage with large datasets. I addressed this by using product quantization to reduce the memory footprint of the embeddings while maintaining reasonable search accuracy. + +10. 
**How does FAISS compare to traditional databases for similarity search?** + - Unlike traditional databases that focus on exact matches and structured queries, FAISS is optimized for high-dimensional vector similarity searches, allowing for approximate matches that are crucial in AI applications like image and text retrieval. + +11. **What are the typical preprocessing steps before using FAISS?** + - Typical preprocessing steps include: + - Normalizing the vectors to ensure consistent distances. + - Reducing dimensionality if necessary, using techniques like PCA. + - Ensuring that the data is in the correct format for FAISS indexing. + +12. **How do you evaluate the performance of a FAISS index?** + - Performance can be evaluated using metrics such as: + - Recall: The fraction of relevant items retrieved. + - Precision: The fraction of retrieved items that are relevant. + - Latency: The time taken to perform searches. + +13. **What is the significance of the `nlist` parameter in FAISS?** + - The `nlist` parameter defines the number of clusters in an inverted file index. A higher `nlist` can improve recall but may increase search time and memory usage. Tuning this parameter is crucial for balancing performance and resource usage. + +14. **How can FAISS be integrated with machine learning models?** + - FAISS can be integrated with machine learning models by using it to index embeddings generated by those models. For example, after training a neural network to generate embeddings for images, FAISS can be used to perform similarity searches among those embeddings. + +15. **What is the role of quantization in FAISS?** + - Quantization reduces the precision of vector representations to decrease memory usage and speed up searches. FAISS supports various quantization techniques, such as scalar quantization and product quantization, to optimize performance. + +16. **Can you explain the concept of "inverted file" indexing in FAISS?** + - Inverted file indexing groups vectors into clusters and maintains a list of vectors for each cluster. This allows FAISS to quickly narrow down the search to a subset of vectors, significantly speeding up the nearest neighbor search process. + +17. **How do you handle updates to the dataset in FAISS?** + - FAISS allows for dynamic updates by adding or removing vectors from the index. However, for large-scale updates, it may be more efficient to rebuild the index periodically rather than updating it incrementally. + +18. **What are some common pitfalls when using FAISS?** + - Common pitfalls include: + - Not normalizing vectors, which can lead to inaccurate distance calculations. + - Using inappropriate index types for the data size and search requirements. + - Failing to tune parameters like `nlist` and `nprobe` for optimal performance. + +19. **How does FAISS support GPU acceleration?** + - FAISS provides a GPU module that allows for the indexing and searching of vectors on NVIDIA GPUs. This significantly speeds up operations, especially for large datasets and complex queries. + +20. **What is the `nprobe` parameter in FAISS, and how does it affect search results?** + - The `nprobe` parameter determines the number of clusters to search during a query. A higher `nprobe` increases the chances of finding relevant results but also increases search time. Tuning this parameter is essential for balancing speed and accuracy. + +21. 
**How can you use FAISS for clustering tasks?** + - FAISS can be used for clustering by applying algorithms like k-means on the vector embeddings. Once clusters are formed, FAISS can efficiently retrieve points belonging to specific clusters or find nearest neighbors within those clusters. + +22. **What are the trade-offs between using exact and approximate search in FAISS?** + - Exact search guarantees the most accurate results but is computationally expensive and slow for large datasets. Approximate search is faster and uses less memory but may sacrifice some accuracy, making it suitable for real-time applications. + +23. **Can FAISS be used for text similarity search? If so, how?** + - Yes, FAISS can be used for text similarity search by converting text into embeddings using models like BERT or Sentence Transformers. These embeddings can then be indexed in FAISS for efficient similarity searches. + +24. **How would you implement a recommendation system using FAISS?** + - To implement a recommendation system: + - Generate embeddings for items and users. + - Index these embeddings using FAISS. + - For a given user, retrieve similar items based on their embedding using FAISS's nearest neighbor search. + +25. **What is the role of the `metric` parameter in FAISS?** + - The `metric` parameter defines the distance metric used for similarity calculations, such as L2 (Euclidean) or inner product. The choice of metric can significantly affect the search results and should align with the data characteristics. + +26. **How do you ensure the scalability of FAISS in production environments?** + - Scalability can be ensured by: + - Using distributed computing frameworks to handle large datasets. + - Optimizing index parameters based on the expected load and query patterns. + - Regularly monitoring performance and adjusting configurations as needed. + +27. **What are some best practices for using FAISS effectively?** + - Best practices include: + - Regularly profiling and benchmarking index performance. + - Experimenting with different index types and parameters. + - Keeping the vector space normalized and well-structured for optimal searches. + +28. **How do you manage the trade-off between accuracy and speed in FAISS?** + - Manage this trade-off by tuning parameters like `nlist`, `nprobe`, and choosing the right index type based on the specific application requirements. Regular testing and validation against real-world queries can help find the right balance. + +29. **Can you discuss a specific feature of FAISS that you find particularly useful?** + - One particularly useful feature is the ability to perform multi-threaded searches, which significantly speeds up retrieval times, especially when handling large datasets in real-time applications. + +30. **What future developments do you foresee for FAISS and vector search technologies?** + - Future developments may include enhanced support for hybrid search combining structured and unstructured data, improved algorithms for dynamic indexing, and better integration with deep learning frameworks for real-time applications. + +These questions and answers should help you prepare effectively for your interview related to FAISS and its applications in Generative AI. 
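
As a rough sketch of the text-similarity / recommendation flow outlined in questions 23 and 24 above (the `sentence-transformers` model name is an assumption; any encoder that returns fixed-size vectors would work):

```python
import faiss
from sentence_transformers import SentenceTransformer

items = [
    "wireless bluetooth headphones",
    "running shoes for men",
    "noise cancelling earbuds",
    "non-slip yoga mat",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
item_vecs = model.encode(items).astype("float32")
faiss.normalize_L2(item_vecs)                  # unit length, so inner product == cosine

index = faiss.IndexFlatIP(item_vecs.shape[1])  # inner-product (cosine) index
index.add(item_vecs)

query = model.encode(["earphones with good bass"]).astype("float32")
faiss.normalize_L2(query)
scores, ids = index.search(query, 2)
print([items[i] for i in ids[0]], scores)
```

Normalising the vectors first makes the inner-product metric behave like cosine similarity, which ties back to the `metric` discussion in question 25.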
+ +Citations: +[1] https://incubity.ambilio.com/top-25-generative-ai-interview-questions-with-answers/ +[2] https://www.youtube.com/watch?v=F1lsFTpsQLI +[3] https://www.usebraintrust.com/hire/interview-questions/generative-ai-specialists +[4] https://www.reddit.com/r/MachineLearning/comments/17u7b19/d_genaillm_interview_prep/ +[5] https://blog.streamlit.io/ai-interviewer-customized-interview-preparation-with-generative-ai/ From b49bfce1b3f1273a9f110bcb665c9f6970b1287d Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:11:41 +0530 Subject: [PATCH 18/38] Create 1.2 Next Word Prediction.md --- GenerativeAI/1.2 Next Word Prediction.md | 51 ++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 GenerativeAI/1.2 Next Word Prediction.md diff --git a/GenerativeAI/1.2 Next Word Prediction.md b/GenerativeAI/1.2 Next Word Prediction.md new file mode 100644 index 0000000..624e85e --- /dev/null +++ b/GenerativeAI/1.2 Next Word Prediction.md @@ -0,0 +1,51 @@ +Next word prediction is a fundamental task in Natural Language Processing (NLP) that involves predicting the most likely word to follow a given sequence of words. This task has evolved significantly with the advent of deep learning models, particularly the Transformer architecture, which has transformed the landscape of NLP. + +## Evolution of Next Word Prediction Models + +### Early Models: RNNs, LSTMs, and GRUs + +Before the introduction of Transformers, next word prediction was primarily handled by Recurrent Neural Networks (RNNs) and their variants, such as Long Short-Term Memory (LSTM) networks and Gated Recurrent Units (GRU). + +- **RNNs** maintain hidden states that capture information from previous inputs, allowing them to process sequences of data. However, they often struggle with long-range dependencies due to issues like the vanishing gradient problem. + +- **LSTMs** were designed to overcome these limitations by introducing memory cells that can store and retrieve information over longer sequences, making them effective for capturing long-term dependencies. + +- **GRUs** simplify the LSTM architecture by merging the cell state and hidden state, providing a more computationally efficient alternative while still managing to capture long-range dependencies effectively[1]. + +These models laid the groundwork for understanding sequential data and context in language, but they were limited by their sequential processing nature, which hindered parallelization and scalability. + +## The Transformer Architecture + +Introduced in the groundbreaking paper "Attention Is All You Need" by Vaswani et al. in 2017, the Transformer model revolutionized next word prediction by eliminating the recurrence mechanism entirely. Instead, it relies on a self-attention mechanism that allows it to process all words in a sequence simultaneously, capturing relationships between words regardless of their distance from each other in the text. + +### Key Components of Transformers + +1. **Self-Attention Mechanism**: This mechanism allows the model to weigh the importance of different words in the input sequence when making predictions. Each word can attend to all other words, enabling the model to capture complex dependencies and contextual relationships effectively. + +2. **Positional Encoding**: Since Transformers do not process sequences in order, they use positional encodings to retain information about the position of words within the sequence. 
This helps the model understand the order of words, which is crucial for language comprehension. + +3. **Encoder-Decoder Structure**: The Transformer consists of an encoder that processes the input sequence and a decoder that generates the output sequence. Each encoder and decoder layer employs self-attention and feed-forward networks, allowing for efficient learning of language patterns[2][3]. + +### Advantages of Transformers + +Transformers offer several advantages over previous models: + +- **Parallelization**: Unlike RNNs, which process inputs sequentially, Transformers can process entire sequences simultaneously, significantly speeding up training. + +- **Long-Range Dependencies**: The self-attention mechanism enables better handling of long-range dependencies, allowing the model to consider the entire context when predicting the next word. + +- **Scalability**: Transformers can be scaled up easily, leading to the development of large language models (LLMs) like GPT-3 and BERT, which have demonstrated remarkable performance across various NLP tasks, including next word prediction[4][5]. + +## Conclusion + +The transition from RNNs and their variants to the Transformer architecture marks a significant advancement in next word prediction capabilities. Transformers have not only improved the efficiency and accuracy of predictions but have also paved the way for the development of sophisticated language models that can understand and generate human-like text. This evolution underscores the importance of architectural innovations in enhancing the performance of NLP applications. + +Citations: +[1] https://www.geeksforgeeks.org/next-word-prediction-with-deep-learning-in-nlp/ +[2] https://datasciencedojo.com/blog/transformer-models/ +[3] https://en.wikipedia.org/wiki/Transformer_%28machine_learning_model%29 +[4] https://www.leewayhertz.com/decision-transformer/ +[5] https://towardsdatascience.com/transformers-141e32e69591 +[6] https://www.datacamp.com/tutorial/how-transformers-work +[7] https://www.geeksforgeeks.org/getting-started-with-transformers/ +[8] https://www.techscience.com/cmc/v78n3/55891/html From d5f1d27a064baf30c0a3dce6f1ad1059cd8704c6 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:16:01 +0530 Subject: [PATCH 19/38] Create 1.3 Embedding Process. - Mathematical Intution.md --- ...edding Process. - Mathematical Intution.md | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 GenerativeAI/1.3 Embedding Process. - Mathematical Intution.md diff --git a/GenerativeAI/1.3 Embedding Process. - Mathematical Intution.md b/GenerativeAI/1.3 Embedding Process. - Mathematical Intution.md new file mode 100644 index 0000000..4194bb2 --- /dev/null +++ b/GenerativeAI/1.3 Embedding Process. - Mathematical Intution.md @@ -0,0 +1,35 @@ +## The Intuition Behind Embeddings in Transformers + +Embeddings are a fundamental component of Transformer models, allowing them to represent words and tokens as numerical vectors that can be processed by neural networks. The embedding process maps discrete tokens (like words) into a continuous vector space, where similar tokens are positioned close together. This embedding space captures semantic and syntactic relationships between tokens. + +Some key characteristics of embeddings that make them useful for Transformers: + +### Continuous Representation +Embeddings represent tokens as continuous vectors, rather than discrete indices. 
This allows the model to learn smooth relationships between tokens, enabling better generalization. + +### Dimensionality Reduction +High-dimensional one-hot encoded token representations are mapped to a much lower dimensional embedding space (e.g. 300 dimensions). This dimensionality reduction allows the model to efficiently process and store token representations. + +### Semantic Relationships +The embedding space encodes semantic relationships between tokens. For example, the vector for "king" - "man" + "woman" points to the vector for "queen"[1]. These relationships emerge from the training data. + +### Parallelization +Embeddings allow the model to process all tokens in parallel, rather than sequentially. This is important for the self-attention mechanism in Transformers, which computes relationships between all pairs of tokens[3]. + +### Transfer Learning +Pre-trained embeddings, like those from BERT, can be fine-tuned on specific tasks. The embeddings capture general language knowledge that can be leveraged for various applications[5]. + +### Intuitive Visualization +Embeddings can be visualized in 2D or 3D space to gain intuitions about the model's internal representations. Semantically similar tokens cluster together in the embedding space[4]. + +Mathematically, an embedding space is a manifold in which similar items are positioned closer to one another than dissimilar items[6]. The embedding process maps discrete tokens to points on this manifold, preserving semantic relationships. Transformers leverage these properties of embeddings to efficiently process and reason about language. + +Citations: +[1] https://towardsdatascience.com/analyzing-transformers-in-embedding-space-explained-ef72130a6844?gi=ecd132be68ed +[2] https://news.ycombinator.com/item?id=40497379 +[3] https://towardsdatascience.com/transformers-intuitively-and-exhaustively-explained-58a5c5df8dbb +[4] https://encord.com/blog/embeddings-machine-learning/ +[5] https://www.datacamp.com/tutorial/how-transformers-work +[6] https://stackoverflow.blog/2023/11/09/an-intuitive-introduction-to-text-embeddings/ +[7] https://www.ibm.com/think/topics/vector-embedding +[8] https://www.geeksforgeeks.org/word-embeddings-in-nlp/ From a83c6eee429d9acee6e37e2fdb761299dc4ce72a Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:20:32 +0530 Subject: [PATCH 20/38] Create 1.4 Attention Block - Python Example.md --- .../1.4 Attention Block - Python Example.md | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 GenerativeAI/1.4 Attention Block - Python Example.md diff --git a/GenerativeAI/1.4 Attention Block - Python Example.md b/GenerativeAI/1.4 Attention Block - Python Example.md new file mode 100644 index 0000000..7938232 --- /dev/null +++ b/GenerativeAI/1.4 Attention Block - Python Example.md @@ -0,0 +1,88 @@ +The attention mechanism in Transformers is a powerful mathematical framework that enables models to focus on different parts of the input sequence, allowing for better understanding of context and relationships within the data. This is particularly useful in tasks such as natural language processing and image recognition. + +## Mathematical Intuition of Attention Block + +### Key Concepts + +1. **Queries, Keys, and Values**: In the context of attention, each input is transformed into three vectors: + - **Query (Q)**: Represents the item for which we want to find relevant information. + - **Key (K)**: Represents the items in the input that can provide information. 
+ - **Value (V)**: Represents the actual information associated with each key. + +2. **Scaled Dot-Product Attention**: The attention score between queries and keys is computed using the dot product, scaled by the square root of the dimension of the key vectors, followed by a softmax operation to obtain attention weights. The output is then a weighted sum of the value vectors. + + The formula for the attention mechanism can be summarized as: + + $$ + \text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V + $$ + + where $$d_k$$ is the dimension of the key vectors. + +3. **Multi-Head Attention**: Instead of performing a single attention function, multiple attention heads are used. Each head learns different representations by applying the attention mechanism independently and then concatenating their outputs. + +### End-to-End Process Example + +To illustrate the attention mechanism, we can implement a simple version using Python and NumPy. Below is a step-by-step example. + +```python +import numpy as np + +# Define input dimensions +d_model = 4 # Dimension of the model +d_k = 2 # Dimension of keys and queries +d_v = 2 # Dimension of values +num_heads = 2 # Number of attention heads + +# Sample input data (3 tokens in the sequence, each represented by a vector of size d_model) +X = np.array([[1, 0, 1, 0], + [0, 1, 0, 1], + [1, 1, 1, 1]]) + +# Randomly initialize weight matrices for queries, keys, and values +W_Q = np.random.rand(d_model, d_k) +W_K = np.random.rand(d_model, d_k) +W_V = np.random.rand(d_model, d_v) + +# Compute queries, keys, and values. @ is the Matrix Multiplication Op. +Q = X @ W_Q +K = X @ W_K +V = X @ W_V + +# Compute attention scores +scores = Q @ K.T / np.sqrt(d_k) # Scale scores +attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True) # Softmax + +# Compute output +output = attention_weights @ V + +print("Queries:\n", Q) +print("Keys:\n", K) +print("Values:\n", V) +print("Attention Weights:\n", attention_weights) +print("Output:\n", output) +``` + +### Explanation of the Code + +1. **Input Data**: We define a simple input matrix `X` representing three tokens, each with a feature vector of size `d_model`. + +2. **Weight Matrices**: Random weight matrices `W_Q`, `W_K`, and `W_V` are initialized for transforming the input into queries, keys, and values. + +3. **Computing Q, K, V**: The input matrix is multiplied by the corresponding weight matrices to obtain the queries, keys, and values. + +4. **Attention Scores**: The dot product of queries and keys is computed, scaled, and passed through a softmax function to obtain attention weights. + +5. **Output Calculation**: The final output is computed as a weighted sum of the values based on the attention weights. + +This example demonstrates the core functionality of the attention mechanism, capturing the relationships between different tokens in the input sequence. The multi-head attention can be implemented similarly by repeating the process for multiple sets of weight matrices and concatenating the results. 
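### Extending to Multi-Head Attention (Sketch)

As noted above, multi-head attention repeats the same computation with several independent sets of weight matrices and concatenates the results. The snippet below is one minimal way to express that in NumPy; the head count, the random weights, and the final output projection `W_O` are illustrative assumptions rather than part of the original example.

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - np.max(x, axis=axis, keepdims=True))  # numerically stable softmax
    return e / np.sum(e, axis=axis, keepdims=True)

d_model, num_heads = 4, 2
d_head = d_model // num_heads            # per-head dimension

# Same 3-token input as in the single-head example
X = np.array([[1, 0, 1, 0],
              [0, 1, 0, 1],
              [1, 1, 1, 1]], dtype=float)

rng = np.random.default_rng(0)
head_outputs = []
for _ in range(num_heads):
    # Each head gets its own query/key/value projections
    W_Q = rng.random((d_model, d_head))
    W_K = rng.random((d_model, d_head))
    W_V = rng.random((d_model, d_head))

    Q, K, V = X @ W_Q, X @ W_K, X @ W_V
    weights = softmax(Q @ K.T / np.sqrt(d_head))   # attention weights for this head
    head_outputs.append(weights @ V)

# Concatenate the heads and mix them with a final output projection
concat = np.concatenate(head_outputs, axis=-1)     # shape (3, d_model)
W_O = rng.random((d_model, d_model))
multi_head_output = concat @ W_O

print("Multi-head attention output:\n", multi_head_output)
```

Splitting `d_model` across the heads keeps the total cost comparable to the single-head case while letting each head focus on different relationships between tokens.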
+ +Citations: +[1] https://learnopencv.com/attention-mechanism-in-transformer-neural-networks/ +[2] https://transformer-circuits.pub/2021/framework/index.html +[3] https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html +[4] https://machinelearningmastery.com/the-transformer-attention-mechanism/ +[5] https://towardsdatascience.com/the-math-behind-multi-head-attention-in-transformers-c26cba15f625 +[6] https://nlp.seas.harvard.edu/2018/04/03/attention.html +[7] https://www.youtube.com/watch?v=kO0XdAsY5YA +[8] https://towardsdatascience.com/transformers-intuitively-and-exhaustively-explained-58a5c5df8dbb From 9bdaf979059622051d04be5e8353f2e8ad5d6dc4 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:22:54 +0530 Subject: [PATCH 21/38] Create 1.5 MLP Block - Python Example.md --- .../1.5 MLP Block - Python Example.md | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 GenerativeAI/1.5 MLP Block - Python Example.md diff --git a/GenerativeAI/1.5 MLP Block - Python Example.md b/GenerativeAI/1.5 MLP Block - Python Example.md new file mode 100644 index 0000000..b0cc104 --- /dev/null +++ b/GenerativeAI/1.5 MLP Block - Python Example.md @@ -0,0 +1,71 @@ +## Multi-Layer Perceptron (MLP) in Transformers + +The Multi-Layer Perceptron (MLP) is a key component of the Transformer architecture, responsible for refining the representation of each token using a non-linear transformation. Here's the mathematical intuition behind the MLP in Transformers: + +### Mathematical Formulation + +The MLP in Transformers operates across the features of each token, applying the same non-linear transformation to each token independently. Given the output of the self-attention layer `y(m)_n` for token `n` at layer `m`, the MLP computes: + +$$ +x^{(m+1)}_n = \text{MLP}_\theta(y^{(m)}_n) +$$ + +where `\theta` represents the parameters of the MLP, which are shared across all tokens. + +The MLP typically consists of one or two hidden layers with a dimension equal to the number of features `D` (or larger). The computational cost of this step is roughly `N * D * D`, where `N` is the sequence length. + +### Example Implementation in Python and NumPy + +Here's a simple example of implementing the MLP component in Transformers using Python and NumPy: + +```python +import numpy as np + +# Define MLP parameters +D = 4 # Number of features +hidden_size = 8 # Size of the hidden layer + +# Sample input from the self-attention layer +y = np.array([[1, 0, 1, 0], + [0, 1, 0, 1], + [1, 1, 1, 1]]) + +# Initialize MLP weights +W1 = np.random.rand(D, hidden_size) +b1 = np.random.rand(1, hidden_size) +W2 = np.random.rand(hidden_size, D) +b2 = np.random.rand(1, D) + +# Compute MLP output +h = np.maximum(0, y @ W1 + b1) # ReLU activation in the hidden layer +x = h @ W2 + b2 # Linear output layer + +print("Input from self-attention layer:\n", y) +print("Output of the MLP:\n", x) +``` + +In this example: + +1. We define the MLP parameters, including the number of features `D` and the size of the hidden layer. + +2. We create a sample input `y` from the self-attention layer. + +3. We initialize the weights and biases of the MLP randomly. + +4. We compute the output of the MLP by applying the following steps: + - Compute the hidden layer activation using a ReLU non-linearity. + - Apply the output layer weights and biases to obtain the final output. + +5. Finally, we print the input from the self-attention layer and the output of the MLP. 
+ +The MLP in Transformers acts as a non-linear feature extractor, processing the output of the self-attention layer independently for each token. It helps capture complex interactions between features and refine the representations learned by the self-attention mechanism. + +Citations: +[1] https://www.youtube.com/watch?v=kO0XdAsY5YA +[2] https://transformer-circuits.pub/2021/framework/index.html +[3] https://arxiv.org/abs/2304.10557 +[4] https://learnopencv.com/attention-mechanism-in-transformer-neural-networks/ +[5] https://arxiv.org/pdf/2304.10557.pdf +[6] https://www.youtube.com/watch?v=idVm0DMaDR4 +[7] https://towardsdatascience.com/the-math-behind-multi-head-attention-in-transformers-c26cba15f625 +[8] https://www.youtube.com/watch?v=qw7wFGgNCSU From 07bfad2cab65b13f13bd33ecdb0643ab0052b048 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:28:05 +0530 Subject: [PATCH 22/38] Create 1.6 Positional Encoding - Python Example.md --- ....6 Positional Encoding - Python Example.md | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 GenerativeAI/1.6 Positional Encoding - Python Example.md diff --git a/GenerativeAI/1.6 Positional Encoding - Python Example.md b/GenerativeAI/1.6 Positional Encoding - Python Example.md new file mode 100644 index 0000000..60cb7b7 --- /dev/null +++ b/GenerativeAI/1.6 Positional Encoding - Python Example.md @@ -0,0 +1,80 @@ +## Positional Encoding in Transformers + +Positional encoding is a critical component of the Transformer architecture, designed to provide information about the position of tokens in a sequence. Unlike recurrent neural networks (RNNs), which inherently process sequences in order, Transformers process all tokens in parallel. This parallel processing means that Transformers lack an inherent understanding of the order of tokens, making positional encodings essential. + +### Mathematical Intuition + +The primary goal of positional encoding is to inject information about the position of each token in the input sequence. The positional encoding for a token at position $$ p $$ in a sequence is defined using sine and cosine functions of varying frequencies, as follows: + +- For even indices: + $$ + PE(p, 2i) = \sin\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right) + $$ + +- For odd indices: + $$ + PE(p, 2i+1) = \cos\left(\frac{p}{10000^{2i/d_{\text{model}}}}\right) + $$ + +Where: +- $$ p $$ is the position of the token in the sequence. +- $$ i $$ is the dimension index. +- $$ d_{\text{model}} $$ is the total number of dimensions in the embedding. + +This formulation allows each position to have a unique encoding, and the use of sine and cosine functions ensures that the positional encodings can capture relative positions. The geometric progression of frequencies allows the model to learn to attend to relative positions effectively. + +### End-to-End Process Example + +To illustrate how positional encoding works in practice, we can implement it using Python and NumPy. Below is a step-by-step example. 
+ +```python +import numpy as np + +def positional_encoding(max_len, d_model): + # Initialize the positional encoding matrix + pos_enc = np.zeros((max_len, d_model)) + + # Compute positional encodings + for p in range(max_len): + for i in range(0, d_model, 2): + pos_enc[p, i] = np.sin(p / (10000 ** (2 * i / d_model))) + if i + 1 < d_model: + pos_enc[p, i + 1] = np.cos(p / (10000 ** (2 * i / d_model))) + + return pos_enc + +# Example parameters +max_len = 10 # Maximum length of the input sequence +d_model = 4 # Dimension of the embedding + +# Compute positional encodings +pos_encodings = positional_encoding(max_len, d_model) + +print("Positional Encodings:\n", pos_encodings) +``` + +### Explanation of the Code + +1. **Function Definition**: The `positional_encoding` function takes two parameters: `max_len` (the maximum length of the input sequence) and `d_model` (the dimensionality of the embedding). + +2. **Matrix Initialization**: A zero matrix `pos_enc` is initialized to store the positional encodings. + +3. **Computing Encodings**: Two nested loops iterate over each position $$ p $$ and dimension $$ i $$: + - For even indices, the sine function is applied. + - For odd indices, the cosine function is applied. + +4. **Output**: The resulting positional encodings matrix is printed, showing the positional information for each position in the sequence. + +### Summary + +Positional encoding is essential in the Transformer architecture, allowing the model to incorporate information about the order of tokens in a sequence. By using sine and cosine functions, positional encodings provide unique representations for each position, enabling the model to learn relationships between tokens effectively. This approach enhances the model's ability to process sequences without losing the critical information of token order. 
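### Combining the Encodings with Token Embeddings (Sketch)

In the original Transformer, the positional encodings are not used on their own: they are added element-wise to the token embeddings before the first attention layer. The short sketch below reuses the `positional_encoding` function defined above; the random token embeddings are placeholders standing in for a learned embedding lookup.

```python
import numpy as np

seq_len, d_model = 6, 4

# Placeholder token embeddings for a 6-token sequence
token_embeddings = np.random.rand(seq_len, d_model)

# Positional encodings from the function defined above
pos = positional_encoding(max_len=seq_len, d_model=d_model)

# The model input is the element-wise sum of the two
model_input = token_embeddings + pos

print("Embeddings with positional information:\n", model_input)
```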
+ +Citations: +[1] https://www.geeksforgeeks.org/positional-encoding-in-transformers/ +[2] https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/ +[3] https://kazemnejad.com/blog/transformer_architecture_positional_encoding/ +[4] https://www.youtube.com/watch?v=kO0XdAsY5YA +[5] https://nlp.seas.harvard.edu/2018/04/03/attention.html +[6] https://www.linkedin.com/pulse/deep-dive-positional-encodings-transformer-neural-network-ajay-taneja +[7] https://uvadlc-notebooks.readthedocs.io/en/latest/tutorial_notebooks/tutorial6/Transformers_and_MHAttention.html +[8] https://www.youtube.com/watch?v=ZMxVe-HK174 From 94c8ca02e489717d4e2025b19d11b10a316066f4 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:29:19 +0530 Subject: [PATCH 23/38] Rename 1.What does generative truly mean.md to 1.1.What does generative truly mean.md --- ...ative truly mean.md => 1.1.What does generative truly mean.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename GenerativeAI/{1.What does generative truly mean.md => 1.1.What does generative truly mean.md} (100%) diff --git a/GenerativeAI/1.What does generative truly mean.md b/GenerativeAI/1.1.What does generative truly mean.md similarity index 100% rename from GenerativeAI/1.What does generative truly mean.md rename to GenerativeAI/1.1.What does generative truly mean.md From f6348cccd5f9279992af801083041ffd97c2502e Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年8月18日 19:34:13 +0530 Subject: [PATCH 24/38] Create 1.7 End to End process of Attention.md --- .../1.7 End to End process of Attention.md | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 GenerativeAI/1.7 End to End process of Attention.md diff --git a/GenerativeAI/1.7 End to End process of Attention.md b/GenerativeAI/1.7 End to End process of Attention.md new file mode 100644 index 0000000..9884f8b --- /dev/null +++ b/GenerativeAI/1.7 End to End process of Attention.md @@ -0,0 +1,109 @@ +To illustrate the functioning of the attention block in Transformers, let's break down the process using a sample sentence, its embedding vector, and the attention mechanism step-by-step. We will also show how to compute the probability distribution of the next best word based on the attention scores. + +### Example Sentence + +Let's take the sentence: **"Life is short"**. + +### Step 1: Word Embedding + +First, we need to convert the words into embedding vectors. For simplicity, we'll use random embeddings for each word. + +```python +import numpy as np + +# Define the sentence and create a dictionary for word indices +sentence = "Life is short" +words = sentence.split() +word_to_index = {word: i for i, word in enumerate(words)} + +# Create random embeddings for each word +embedding_dim = 4 # Dimension of the embedding +embeddings = np.random.rand(len(words), embedding_dim) + +print("Word Indices:", word_to_index) +print("Word Embeddings:\n", embeddings) +``` + +### Step 2: Compute Queries, Keys, and Values + +In the attention mechanism, we need to compute the queries (Q), keys (K), and values (V) from the embeddings. We will use learned weight matrices for this purpose. 
+ +```python +# Initialize weight matrices for Q, K, and V +W_Q = np.random.rand(embedding_dim, embedding_dim) +W_K = np.random.rand(embedding_dim, embedding_dim) +W_V = np.random.rand(embedding_dim, embedding_dim) + +# Compute Q, K, V +Q = embeddings @ W_Q +K = embeddings @ W_K +V = embeddings @ W_V + +print("Queries (Q):\n", Q) +print("Keys (K):\n", K) +print("Values (V):\n", V) +``` + +### Step 3: Compute Attention Scores + +Next, we calculate the attention scores using the dot product of the queries and keys, followed by a softmax to obtain the attention weights. + +```python +# Compute attention scores +scores = Q @ K.T / np.sqrt(embedding_dim) # Scale by the square root of the dimension +attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True) # Softmax + +print("Attention Scores:\n", scores) +print("Attention Weights:\n", attention_weights) +``` + +### Step 4: Compute Output of the Attention Block + +The output of the attention block is computed as a weighted sum of the values, using the attention weights. + +```python +# Compute the output of the attention block +output = attention_weights @ V + +print("Output of Attention Block:\n", output) +``` + +### Step 5: Probability Distribution for Next Word + +To predict the next word, we can apply a simple linear layer followed by a softmax function to the output of the attention block. This simulates how we would generate probabilities for the next word in a sequence. + +```python +# Initialize weights for the output layer +W_out = np.random.rand(embedding_dim, len(words)) + +# Compute logits +logits = output @ W_out + +# Compute probabilities using softmax +probabilities = np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True) + +print("Logits:\n", logits) +print("Probability Distribution for Next Word:\n", probabilities) +``` + +### Summary of the Process + +1. **Word Embedding**: Convert words into embedding vectors. +2. **Compute Q, K, V**: Use learned weight matrices to compute queries, keys, and values from the embeddings. +3. **Attention Scores**: Calculate scores using the dot product of queries and keys, then apply softmax to obtain attention weights. +4. **Output of Attention Block**: Compute the output as a weighted sum of the values based on the attention weights. +5. **Next Word Probability**: Generate a probability distribution for the next word using a linear transformation followed by softmax. + +### Final Output + +The final output will show the probability distribution of the next best word based on the attention mechanism applied to the input sentence. This allows the model to capture the context and relationships between the words effectively. 
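### From Probabilities to a Next Word (Sketch)

To close the loop on the walkthrough, the distribution computed above can be turned into an actual next-word choice. The snippet below reuses `probabilities` and `words` from the example; in this toy setup the 'vocabulary' is just the three input words, and both greedy argmax and temperature-scaled sampling are shown purely as illustrations.

```python
import numpy as np

# Distribution predicted at the last position of the sequence
next_word_probs = probabilities[-1]

# Greedy decoding: pick the single most likely word
greedy_choice = words[int(np.argmax(next_word_probs))]

# Temperature sampling: sharpen (T < 1) or flatten (T > 1) the distribution, then sample
temperature = 0.8                                  # illustrative value
scaled = next_word_probs ** (1.0 / temperature)
scaled = scaled / scaled.sum()
sampled_choice = np.random.choice(words, p=scaled)

print("Greedy next word:", greedy_choice)
print("Sampled next word:", sampled_choice)
```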
+ +Citations: +[1] https://nlp.gluon.ai/examples/sentence_embedding/self_attentive_sentence_embedding.html +[2] https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html +[3] https://datascience.stackexchange.com/questions/95134/how-to-encode-a-sentence-using-an-attention-mechanism +[4] https://towardsdatascience.com/contextual-transformer-embeddings-using-self-attention-explained-with-diagrams-and-python-code-d7a9f0f4d94e?gi=0dee21177e82 +[5] https://github.com/gazelle93/Transformer-Various-Positional-Encoding +[6] https://www.linkedin.com/pulse/deep-dive-positional-encodings-transformer-neural-network-ajay-taneja +[7] https://towardsdatascience.com/all-you-need-to-know-about-attention-and-transformers-in-depth-understanding-part-1-552f0b41d021?gi=4b6a109307fe +[8] https://stackoverflow.blog/2023/11/09/an-intuitive-introduction-to-text-embeddings/ From 78b5f449ef0d565a648f1e46ed0c605fd3b7d1ee Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年9月10日 11:30:06 +0530 Subject: [PATCH 25/38] Create 6.RecursiveReferenceRAG.md --- GenerativeAI/6.RecursiveReferenceRAG.md | 88 +++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 GenerativeAI/6.RecursiveReferenceRAG.md diff --git a/GenerativeAI/6.RecursiveReferenceRAG.md b/GenerativeAI/6.RecursiveReferenceRAG.md new file mode 100644 index 0000000..fd192ca --- /dev/null +++ b/GenerativeAI/6.RecursiveReferenceRAG.md @@ -0,0 +1,88 @@ +Sure, here's an example implementation using Python and Langchain to handle document references in a RAG architecture: + +```python +from langchain.document_loaders import TextLoader +from langchain.embeddings import HuggingFaceEmbeddings +from langchain.vectorstores import Chroma +from langchain.chains import RetrievalQA +from langchain.llms import HuggingFaceHub + +class DocumentReferenceRAG: + def __init__(self, documents): + self.documents = documents + self.embeddings = HuggingFaceEmbeddings() + self.vectorstore = Chroma.from_documents(self.documents, self.embeddings) + self.llm = HuggingFaceHub(repo_id="google/flan-t5-xl") + self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=self.vectorstore.as_retriever()) + + def answer_question(self, question, max_recursion_depth=3): + return self._recursive_answer(question, max_recursion_depth) + + def _recursive_answer(self, question, max_recursion_depth, processed_docs=None): + if processed_docs is None: + processed_docs = set() + + result = self.qa.run(question) + processed_docs.add(result.source_documents[0].metadata['source']) + + for doc in result.source_documents: + if 'referenced_docs' in doc.metadata: + for ref_doc_link in doc.metadata['referenced_docs']: + if ref_doc_link not in processed_docs and max_recursion_depth> 0: + ref_doc = self._retrieve_document(ref_doc_link) + if ref_doc: + self.documents.append(ref_doc) + self.vectorstore = Chroma.from_documents(self.documents, self.embeddings) + self.qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff", retriever=self.vectorstore.as_retriever()) + result = self._recursive_answer(question, max_recursion_depth - 1, processed_docs) + break + + return result + + def _retrieve_document(self, doc_link): + # Implement document retrieval logic based on the provided link + # For example, load the document from a file or database + loader = TextLoader(doc_link) + return loader.load()[0] + +# Example usage +doc1 = TextLoader('doc1.txt').load()[0] +doc2 = TextLoader('doc2.txt').load()[0] +doc3 = TextLoader('doc3.txt').load()[0] +doc4 = 
TextLoader('doc4.txt').load()[0] +doc5 = TextLoader('doc5.txt').load()[0] + +rag = DocumentReferenceRAG([doc1, doc2, doc3, doc4, doc5]) +question = "What is the relationship between document 1 and document 3?" +answer = rag.answer_question(question) +print(answer) +``` + +In this example: + +1. The `DocumentReferenceRAG` class is defined to handle the recursive retrieval and processing of documents. + +2. The `__init__` method initializes the necessary components: + - Loads the initial set of documents + - Creates document embeddings using HuggingFaceEmbeddings + - Stores the documents in a Chroma vector store + - Sets up the LLM (HuggingFaceHub) and RetrievalQA chain + +3. The `answer_question` method takes a question and an optional maximum recursion depth. It calls the `_recursive_answer` method to generate the answer. + +4. The `_recursive_answer` method implements the recursive retrieval process: + - Generates an initial answer using the RetrievalQA chain + - Checks if the generated answer references any other documents + - If referenced documents are found, recursively retrieves them using the `_retrieve_document` method + - Appends the retrieved documents to the document collection and updates the vector store and RetrievalQA chain + - Repeats the process until no new referenced documents are found or the maximum recursion depth is reached + +5. The `_retrieve_document` method is a placeholder for the actual document retrieval logic. In this example, it loads the document from a file using the `TextLoader` from Langchain. + +6. In the example usage, five documents are loaded, and the `DocumentReferenceRAG` class is instantiated with these documents. + +7. A question is asked, and the `answer_question` method is called to generate the final answer, considering the referenced documents. + +This implementation demonstrates how to extend a RAG architecture to handle document references using Langchain. The recursive retrieval process ensures that all relevant documents are considered when answering questions, even if they are referenced within other documents. + +Remember to customize the `_retrieve_document` method to match your specific document storage and retrieval mechanism. Additionally, you may want to add more error handling and optimization techniques based on your requirements. 
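One detail the example leaves implicit is where the `referenced_docs` metadata comes from: the recursive lookup in `_recursive_answer` only fires when a retrieved document carries that key. A minimal, hand-rolled way to attach it before building the index is sketched below, using the same illustrative file names as above; in practice the links could come from parsed hyperlinks, citation markers, or a lookup table of document relationships.

```python
from langchain.document_loaders import TextLoader

# Load two of the example documents as before
doc1 = TextLoader('doc1.txt').load()[0]
doc3 = TextLoader('doc3.txt').load()[0]

# TextLoader already records the file path under metadata['source'], which
# _recursive_answer uses to avoid re-processing a document. The outgoing
# references have to be supplied explicitly:
doc1.metadata['referenced_docs'] = ['doc3.txt']

# Documents with no outgoing references simply omit the key,
# and the recursion skips them.
```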
From c2ce93f96c8cc813362e85e1de85a3bb520d8a59 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年9月26日 14:20:02 +0530 Subject: [PATCH 26/38] Create study_plan.md --- DataScience/study_plan.md | 103 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 DataScience/study_plan.md diff --git a/DataScience/study_plan.md b/DataScience/study_plan.md new file mode 100644 index 0000000..4607cbf --- /dev/null +++ b/DataScience/study_plan.md @@ -0,0 +1,103 @@ +Statistics + T test + Z test + ANOVA + Chi Square + Correlation + Covariance + Hypothesis Testing + +Classic ML + Linear Regression + Logistic Regression + Regulazisation (Rigde and Lasso) + Cost Functions + Decision Tree + Random Forest + Ensemble Learning + Bagging and Boosting + XGBoost + LightGBM + +Hyperparamter Tuning + Grid Search + Random Search + HyperOpt + Feature Selection - PCA + +Normmaliztion + Imbalance Dataaet + Imputing Missing data + Handling Outliers + Cross Validation + +Clustering + K-Means clsutering + KNN + Principal Component Analysis + +Perfromance Measures + R-square + Adjusted R-square + Mean Square Error + Root Mean Square Error + MAPE + Mean Absolute Error + + Recall + Precision + Accuracy + F1-Score + ROC-AUC + Confusion Matrix + + Type1 Error + Type2 Error + True Positive Rate + False Positive Rate + + +Adavnced ML + CNN + RCNN + LSTM + Transfromers + BERT + + +Time Series + Trend + Seasonality + Irregualrity + Cyclicity + Stationality + ADF + Making data stationary + White Noise + Holt Winters + FB-Prophet + + +Drift Detection + Type of drifts + KS Test + KL Divergence + Wassertein distance + ADWIN + +NLP + Stemming + Lemmatization + TF-IDF + Word2Vec + Bag of Words models + Spacy + +MLOPS + MLFlow + Model Registry + Data Versioning + Artifacts + + + From 0f63db867982edcc98b6e7b8d22b8a1dd6fd3920 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年9月26日 18:37:04 +0530 Subject: [PATCH 27/38] Update study_plan.md --- DataScience/study_plan.md | 200 +++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 100 deletions(-) diff --git a/DataScience/study_plan.md b/DataScience/study_plan.md index 4607cbf..d74c2a3 100644 --- a/DataScience/study_plan.md +++ b/DataScience/study_plan.md @@ -1,103 +1,103 @@ -Statistics - T test - Z test - ANOVA - Chi Square - Correlation - Covariance - Hypothesis Testing - -Classic ML - Linear Regression - Logistic Regression - Regulazisation (Rigde and Lasso) - Cost Functions - Decision Tree - Random Forest - Ensemble Learning - Bagging and Boosting - XGBoost - LightGBM - -Hyperparamter Tuning - Grid Search - Random Search - HyperOpt - Feature Selection - PCA - -Normmaliztion - Imbalance Dataaet - Imputing Missing data - Handling Outliers - Cross Validation - -Clustering - K-Means clsutering - KNN - Principal Component Analysis - -Perfromance Measures - R-square - Adjusted R-square - Mean Square Error - Root Mean Square Error - MAPE - Mean Absolute Error - - Recall - Precision - Accuracy - F1-Score - ROC-AUC - Confusion Matrix - - Type1 Error - Type2 Error - True Positive Rate - False Positive Rate - - -Adavnced ML - CNN - RCNN - LSTM - Transfromers - BERT - - -Time Series - Trend - Seasonality - Irregualrity - Cyclicity - Stationality - ADF - Making data stationary - White Noise - Holt Winters - FB-Prophet - - -Drift Detection - Type of drifts - KS Test - KL Divergence - Wassertein distance - ADWIN - -NLP - Stemming - Lemmatization - TF-IDF - Word2Vec - Bag of Words models - Spacy - -MLOPS - MLFlow - Model Registry - Data 
Versioning - Artifacts +# Statistics + * T test + * Z test + * ANOVA + * Chi Square + * Correlation + * Covariance + * Hypothesis Testing + +# Classic ML + * Linear Regression + * Logistic Regression + * Regulazisation (Rigde and Lasso) + * Cost Functions + * Decision Tree + * Random Forest + * Ensemble Learning + * Bagging and Boosting + * XGBoost + * LightGBM + +# Hyperparamter Tuning + * Grid Search + * Random Search + * HyperOpt + * Feature Selection - PCA + +# Normmaliztion + * Imbalance Dataaet + * Imputing Missing data + * Handling Outliers + * Cross Validation + +# Clustering + * K-Means clsutering + * KNN + * Principal Component Analysis + +# Perfromance Measures + * R-square + * Adjusted R-square + * Mean Square Error + * Root Mean Square Error + * MAPE + * Mean Absolute Error + + * Recall + * Precision + * Accuracy + * F1-Score + * ROC-AUC + * Confusion Matrix + + * Type1 Error + * Type2 Error + * True Positive Rate + * False Positive Rate + + +# Adavnced ML + * CNN + * RCNN + * LSTM + * Transfromers + * BERT + + +# Time Series + * Trend + * Seasonality + * Irregualrity + * Cyclicity + * Stationality + * ADF + * Making data stationary + * White Noise + * Holt Winters + * FB-Prophet + + +# Drift Detection + * Type of drifts + * KS Test + * KL Divergence + * Wassertein distance + * ADWIN + +# NLP + * Stemming + * Lemmatization + * TF-IDF + * Word2Vec + * Bag of Words models + * Spacy + +# MLOPS + * MLFlow + * Model Registry + * Data Versioning + * Artifacts From f80fe5695235d048c1a625c71cc3f4972e2a5d43 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年9月27日 10:00:08 +0530 Subject: [PATCH 28/38] Create MlAlgoCheatSheet.md --- DataScience/MlAlgoCheatSheet.md | 70 +++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 DataScience/MlAlgoCheatSheet.md diff --git a/DataScience/MlAlgoCheatSheet.md b/DataScience/MlAlgoCheatSheet.md new file mode 100644 index 0000000..ab6eefd --- /dev/null +++ b/DataScience/MlAlgoCheatSheet.md @@ -0,0 +1,70 @@ +## Popular Algorithms in Data Science + +In data science, various algorithms are employed for tasks such as regression and classification. Each algorithm has associated loss functions and performance metrics that help evaluate its effectiveness. Below is a detailed overview of popular algorithms, their loss functions, performance metrics, and caveats for their use. + +### 1. **Linear Regression** +- **Loss Function:** Mean Squared Error (MSE) +- **Performance Metrics:** R-squared (R2), Adjusted R2, Mean Absolute Error (MAE) +- **Caveats:** Sensitive to outliers; performs poorly when the relationship between features and target is non-linear. + +### 2. **Logistic Regression** +- **Loss Function:** Binary Cross-Entropy Loss (Log Loss) +- **Performance Metrics:** Accuracy, Precision, Recall, F1 Score +- **Caveats:** Assumes linearity between the independent variables and the log odds of the dependent variable; not suitable for multi-class problems without modification. + +### 3. **Decision Trees** +- **Loss Function:** Gini Impurity (for classification), Mean Squared Error (for regression) +- **Performance Metrics:** Accuracy, Mean Absolute Error (MAE), Root Mean Squared Error (RMSE) +- **Caveats:** Prone to overfitting; sensitive to small changes in data which can lead to different tree structures. + +### 4. 
**Support Vector Machines (SVM)** +- **Loss Function:** Hinge Loss (for classification), Epsilon-insensitive Loss (for regression) +- **Performance Metrics:** Accuracy, Precision, Recall +- **Caveats:** Computationally expensive for large datasets; requires careful tuning of hyperparameters like the kernel choice. + +### 5. **Random Forest** +- **Loss Function:** Mean Squared Error (for regression), Gini Impurity or Cross-Entropy Loss (for classification) +- **Performance Metrics:** Out-of-Bag Error, Accuracy +- **Caveats:** Can be less interpretable than simpler models; may require significant computational resources. + +### 6. **Gradient Boosting Machines (GBM)** +- **Loss Function:** Log Loss (for classification), Mean Squared Error (for regression) +- **Performance Metrics:** Log-Likelihood, RMSE +- **Caveats:** Sensitive to overfitting if not properly regularized; requires careful tuning of learning rate and tree depth. + +### 7. **Neural Networks** +- **Loss Function:** Cross-Entropy Loss (for classification), Mean Squared Error (for regression) +- **Performance Metrics:** Accuracy, F1 Score, Area Under Curve (AUC) +- **Caveats:** Requires large amounts of data; can be prone to overfitting if not regularized properly; less interpretable compared to traditional models. + +### 8. **K-Means Clustering** +- **Loss Function:** Sum of Squared Errors (SSE) +- **Performance Metrics:** Silhouette Score, Davies-Bouldin Index +- **Caveats:** Assumes spherical clusters; sensitive to initial centroid placement; requires specifying the number of clusters in advance. + +## Summary of Loss Functions and Performance Metrics + +| Algorithm | Loss Function | Performance Metrics | +|------------------------|------------------------------------|----------------------------------------| +| Linear Regression | Mean Squared Error | R2, MAE | +| Logistic Regression | Binary Cross-Entropy | Accuracy, F1 Score | +| Decision Trees | Gini Impurity / MSE | Accuracy, MAE | +| Support Vector Machines | Hinge Loss | Accuracy, Precision | +| Random Forest | MSE / Gini Impurity | Out-of-Bag Error | +| Gradient Boosting | Log Loss / MSE | RMSE | +| Neural Networks | Cross-Entropy / MSE | Accuracy, AUC | +| K-Means Clustering | Sum of Squared Errors | Silhouette Score | + +## Conclusion + +The choice of algorithm depends on the specific characteristics of the dataset and the nature of the problem being solved. Understanding the strengths and weaknesses of each algorithm helps in selecting the most appropriate one for a given task. For instance, while linear regression is simple and interpretable, it may not capture complex relationships in the data. Conversely, neural networks can model intricate patterns but require more data and computational power. + +Citations: + +[1] https://www.datacamp.com/tutorial/loss-function-in-machine-learning
+[2] https://builtin.com/machine-learning/common-loss-functions
+[3] https://www.ibm.com/think/topics/loss-function
+[4] https://neptune.ai/blog/performance-metrics-in-machine-learning-complete-guide
+[5] https://www.geeksforgeeks.org/ml-common-loss-functions/
+[6] https://www.linkedin.com/pulse/performance-metrics-loss-function-machine-learning-alok-choudhary-zou7c
+[7] https://towardsdatascience.com/estimators-loss-functions-optimizers-core-of-ml-algorithms-d603f6b0161a
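To make the loss functions and metrics in the table above concrete, the short sketch below computes several of them with scikit-learn on toy arrays; the numbers and the 0.5 decision threshold are purely illustrative.

```python
import numpy as np
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             accuracy_score, precision_score, recall_score,
                             f1_score, log_loss)

# --- Regression metrics (e.g., Linear Regression, Random Forest regressors) ---
y_true = np.array([3.0, 5.0, 2.5, 7.0])
y_pred = np.array([2.8, 5.4, 2.0, 6.5])
print("MSE :", mean_squared_error(y_true, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_true, y_pred)))
print("MAE :", mean_absolute_error(y_true, y_pred))
print("R2  :", r2_score(y_true, y_pred))

# --- Classification metrics (e.g., Logistic Regression, SVM, Gradient Boosting) ---
y_true_cls = np.array([1, 0, 1, 1, 0, 1])
y_prob = np.array([0.9, 0.2, 0.6, 0.4, 0.1, 0.8])   # predicted P(y = 1)
y_pred_cls = (y_prob >= 0.5).astype(int)            # threshold at 0.5
print("Accuracy :", accuracy_score(y_true_cls, y_pred_cls))
print("Precision:", precision_score(y_true_cls, y_pred_cls))
print("Recall   :", recall_score(y_true_cls, y_pred_cls))
print("F1 score :", f1_score(y_true_cls, y_pred_cls))
print("Log loss :", log_loss(y_true_cls, y_prob))
```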
From 9d2548cdfb7d116033324e66e14ce62593ef5a21 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年9月27日 11:13:05 +0530 Subject: [PATCH 29/38] Create MlAlgoKeyFormulae.md --- DataScience/MlAlgoKeyFormulae.md | 101 +++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 DataScience/MlAlgoKeyFormulae.md diff --git a/DataScience/MlAlgoKeyFormulae.md b/DataScience/MlAlgoKeyFormulae.md new file mode 100644 index 0000000..3a91891 --- /dev/null +++ b/DataScience/MlAlgoKeyFormulae.md @@ -0,0 +1,101 @@ +## Popular Algorithms in Data Science with Mathematical Formulations + +Here is an expanded overview of popular algorithms in data science, including their mathematical formulations, loss functions, performance metrics, and caveats. + +### 1. **Linear Regression** +- **Mathematical Formula:** + $$ y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + ... + \beta_n x_n + \epsilon $$ + where $$y$$ is the dependent variable, $$x_i$$ are independent variables, $$\beta_i$$ are coefficients, and $$\epsilon$$ is the error term. +- **Loss Function:** Mean Squared Error (MSE) + $$ MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 $$ +- **Performance Metrics:** R-squared (R2), Adjusted R2, Mean Absolute Error (MAE) +- **Caveats:** Sensitive to outliers; performs poorly with non-linear relationships. + +### 2. **Logistic Regression** +- **Mathematical Formula:** + $$ P(Y=1|X) = \frac{1}{1 + e^{-(\beta_0 + \beta_1 x_1 + ... + \beta_n x_n)}} $$ +- **Loss Function:** Binary Cross-Entropy Loss (Log Loss) + $$ L = -\frac{1}{n} \sum_{i=1}^{n} [y_i \log(\hat{y}_i) + (1-y_i) \log(1-\hat{y}_i)] $$ +- **Performance Metrics:** Accuracy, Precision, Recall, F1 Score +- **Caveats:** Assumes linearity in log odds; not suitable for multi-class without modification. + +### 3. **Decision Trees** +- **Mathematical Formula:** + - For classification using Gini Impurity: + $$ Gini(D) = 1 - \sum_{j=1}^{C} p_j^2 $$ + where $$p_j$$ is the proportion of class $$j$$ in dataset $$D$$. + - For regression: + $$ MSE(D) = \frac{1}{|D|} \sum_{i=1}^{|D|} (y_i - \bar{y})^2 $$ + where $$y_i$$ are the actual values and $$\bar{y}$$ is the mean of $$y$$. +- **Loss Function:** Gini Impurity or Mean Squared Error +- **Performance Metrics:** Accuracy, MAE +- **Caveats:** Prone to overfitting; sensitive to data changes. + +### 4. **Support Vector Machines (SVM)** +- **Mathematical Formula:** + $$ f(x) = w^T x + b $$ + where $$w$$ is the weight vector and $$b$$ is the bias. +- **Loss Function:** Hinge Loss + $$ L(y, f(x)) = \max(0, 1 - y f(x)) $$ +- **Performance Metrics:** Accuracy, Precision, Recall +- **Caveats:** Computationally expensive for large datasets; requires careful tuning of hyperparameters. + +### 5. **Random Forest** +- **Mathematical Formula:** + The prediction is made by averaging the predictions from multiple decision trees: + $$ \hat{y} = \frac{1}{N} \sum_{i=1}^{N} T_i(x) $$ + where $$T_i$$ are individual trees. +- **Loss Function:** Mean Squared Error or Gini Impurity +- **Performance Metrics:** Out-of-Bag Error, Accuracy +- **Caveats:** Less interpretable than single trees; requires significant computational resources. + +### 6. **Gradient Boosting Machines (GBM)** +- **Mathematical Formula:** + $$ F(x) = F_{m-1}(x) + \gamma_m h_m(x) $$ + where $$h_m(x)$$ is the new tree added at iteration $$m$$. +- **Loss Function:** Log Loss or Mean Squared Error +- **Performance Metrics:** RMSE +- **Caveats:** Sensitive to overfitting; requires careful tuning of learning rate and tree depth. + +### 7. 
**Neural Networks** +- **Mathematical Formula:** + $$ y = f(WX + b) $$ + where $$W$$ are weights, $$X$$ is input data, and $$b$$ is bias. +- **Loss Function:** Cross-Entropy Loss or Mean Squared Error + - Cross-Entropy for classification: + $$ L = -\frac{1}{n} \sum_{i=1}^{n} [y_i \log(\hat{y}_i)] $$ +- **Performance Metrics:** Accuracy, F1 Score, AUC +- **Caveats:** Requires large amounts of data; less interpretable than traditional models. + +### 8. **K-Means Clustering** +- **Mathematical Formula:** +$$ J = \sum_{i=1}^{k} \sum_{j=1}^{n} ||x_j^{(i)} - c_i||^2 $$ +where $$c_i$$ are centroids and $$x_j^{(i)}$$ are data points assigned to cluster $$i$$. +- **Loss Function:** Sum of Squared Errors (SSE) +- **Performance Metrics:** Silhouette Score, Davies-Bouldin Index +- **Caveats:** Assumes spherical clusters; sensitive to initial centroid placement. + +## Summary of Formulations + +| Algorithm | Mathematical Formula | Loss Function | Performance Metrics | +|------------------------|-------------------------------------------------------------------------------------|------------------------------------|----------------------------------------| +| Linear Regression | $$ y = \beta_0 + \beta_1 x_1 + ... + \beta_n x_n + \epsilon $$ | MSE | R2, MAE | +| Logistic Regression | $$ P(Y=1|X) = \frac{1}{1 + e^{-(\beta_0 + ... + \beta_n x_n)}} $$ | Binary Cross-Entropy | Accuracy, F1 Score | +| Decision Trees | Gini: $$ Gini(D) = 1 - \sum p_j^2 $$ | Gini Impurity / MSE | Accuracy, MAE | +| Support Vector Machines | $$ f(x) = w^T x + b $$ | Hinge Loss | Accuracy, Precision | +| Random Forest | $$ \hat{y} = \frac{1}{N} \sum T_i(x) $$ | MSE / Gini Impurity | Out-of-Bag Error | +| Gradient Boosting | $$ F(x) = F_{m-1}(x) + h_m(x) $$ | Log Loss / MSE | RMSE | +| Neural Networks | $$ y = f(WX + b) $$ | Cross-Entropy / MSE | Accuracy, AUC | +| K-Means Clustering | $$ J = \sum ||x_j^{(i)} - c_i||^2 $$ | SSE | Silhouette Score | + +This comprehensive overview provides insights into each algorithm's mathematical foundation along with its practical applications and limitations. Understanding these aspects can help in selecting the right algorithm for specific data science tasks. + +Citations: +[1] https://www.kdnuggets.com/2020/01/decision-tree-algorithm-explained.html +[2] http://fiascodata.blogspot.com/2018/08/decision-tree-mathematical-formulation.html +[3] https://en.wikipedia.org/wiki/Decision_tree_learning +[4] https://www.datascienceprophet.com/understanding-the-mathematics-behind-the-decision-tree-algorithm-part-i/ +[5] https://towardsdatascience.com/the-mathematics-of-decision-trees-random-forest-and-feature-importance-in-scikit-learn-and-spark-f2861df67e3?gi=36fa533e014a +[6] https://www.datacamp.com/tutorial/loss-function-in-machine-learning +[7] https://neptune.ai/blog/performance-metrics-in-machine-learning-complete-guide +[8] https://towardsdatascience.com/estimators-loss-functions-optimizers-core-of-ml-algorithms-d603f6b0161a?gi=5432fa9d3888 From af033d9ac6348fa3489512d17ad9ac14914c9679 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年10月15日 10:20:15 +0530 Subject: [PATCH 30/38] Create gamblers_ruin.py --- gamblers_ruin.py | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 gamblers_ruin.py diff --git a/gamblers_ruin.py b/gamblers_ruin.py new file mode 100644 index 0000000..4dc0381 --- /dev/null +++ b/gamblers_ruin.py @@ -0,0 +1,77 @@ +""" +David vs. 
Goliath Gambler's Ruin Simulation + +This program simulates a gambling scenario between two players: David and Goliath. +David has a skill advantage, represented by a 55% probability of winning each round, +while Goliath has a size advantage with a larger initial amount of money. + +Assumptions: +- David starts with 2,000,ドル and Goliath starts with 10,000ドル. +- Each round of betting results in a transfer of 1,000ドル from the loser to the winner. +- The game continues until one player runs out of money (i.e., their amount reaches zero). +- The outcome of each round is determined by a random number generator, reflecting David's skill advantage. + +Mathematics: +- The simulation models a stochastic process where each round can be viewed as an independent Bernoulli trial: + - David wins with a probability of 0.55. + - Goliath wins with a probability of 0.45. +- The expected outcomes can be analyzed using concepts from probability theory and stochastic processes. +- The simulation runs for a specified number of trials to gather statistical data on how often David wins compared to Goliath. + +Usage: +1. Run the program in a Python environment. +2. Input the desired number of simulations when prompted. +3. The program will output the number of wins for both David and Goliath and display a bar chart of the results. + +This simulation provides insights into how skill can offset size advantages in competitive scenarios. +""" + +import random +import matplotlib.pyplot as plt + +def gambler_ruin(david_initial, goliath_initial, david_win_prob, simulations): + results = [] + + for _ in range(simulations): + david_amount = david_initial + goliath_amount = goliath_initial + + while david_amount> 0 and goliath_amount> 0: + # Simulate a single bet based on David's winning probability + if random.random() < david_win_prob: # David wins + david_amount += 1000 + goliath_amount -= 1000 + else: # Goliath wins + david_amount -= 1000 + goliath_amount += 1000 + + # Record the result: True if David wins, False if Goliath wins + results.append(david_amount> 0) + + return results + +def plot_results(results): + wins = sum(results) + losses = len(results) - wins + + plt.bar(['David Wins', 'Goliath Wins'], [wins, losses], color=['blue', 'red']) + plt.title('David vs. Goliath Simulation Results') + plt.ylabel('Number of Simulations') + plt.show() + +def main(): + david_initial = 2000 # David's initial amount + goliath_initial = 10000 # Goliath's initial amount + david_win_prob = 0.51 # David's skill advantage (55%) + simulations = int(input("Enter number of simulations: ")) + + results = gambler_ruin(david_initial, goliath_initial, david_win_prob, simulations) + + print(f"\nResults after {simulations} simulations:") + print(f"David Wins: {sum(results)}") + print(f"Goliath Wins: {len(results) - sum(results)}") + + plot_results(results) + +if __name__ == "__main__": + main() From 472442deab19f9a17b0a8480347c0704d36ba64a Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年11月27日 09:37:17 +0530 Subject: [PATCH 31/38] Create GeneralMLPrep.md --- DataScience/GeneralMLPrep.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 DataScience/GeneralMLPrep.md diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md new file mode 100644 index 0000000..7bcae6e --- /dev/null +++ b/DataScience/GeneralMLPrep.md @@ -0,0 +1,20 @@ +CNN +========== +* CNN are deep learning architectures that are primarily used for processing image data. 
+* The special operation known as convolution, used in combination with learned filters, helps them extract features like edges and textures.
+* ReLU is applied as an activation function to add non-linearity.
+* Pooling is performed to reduce the spatial dimensions while retaining important information. This helps reduce the computational load and control overfitting.
+* Fully Connected Layer (FCL): after several convolution and pooling operations, the output is passed through an FCL to generate the class probabilities needed for classification.
+
+How CNNs Work:
+==========
+* The input image is transformed into a numerical representation, where each pixel is assigned a value based on its intensity.
+* The convolution operation involves sliding the filter across the image and performing element-wise multiplication, followed by summation to create a feature map.
+* As data progresses through multiple layers, CNNs learn increasingly complex features, from simple edges in early layers to intricate shapes in deeper layers.
+
+Applications:
+==========
+CNNs are widely used in various fields such as:
+* Image Recognition: Identifying objects in images (e.g., facial recognition).
+* Medical Image Analysis: Analyzing X-rays or MRIs for diagnostic purposes.
+* Autonomous Vehicles: Object detection and scene understanding.

From 06dbd0e2b922093dd9e70e9c3c7cffe68c88d5ff Mon Sep 17 00:00:00 2001
From: Amogh Singhal
Date: 2024年11月27日 09:45:54 +0530
Subject: [PATCH 32/38] Update GeneralMLPrep.md

---
 DataScience/GeneralMLPrep.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md
index 7bcae6e..6b333d2 100644
--- a/DataScience/GeneralMLPrep.md
+++ b/DataScience/GeneralMLPrep.md
@@ -18,3 +18,16 @@ CNNs are widely used in various fields such as:
 * Image Recognition: Identifying objects in images (e.g., facial recognition).
 * Medical Image Analysis: Analyzing X-rays or MRIs for diagnostic purposes.
 * Autonomous Vehicles: Object detection and scene understanding.
+
+RNN
+==========
+* RNNs are a class of neural networks that excel at processing sequential data.
+* They maintain an internal state: at time step `t`, the input `x(t)` is combined with the hidden state from the previous step `h(t-1)` to produce a new hidden state `h(t)`.
+* h(t) = f[ W(h) * h(t-1) + W(x) * x(t) + b ], where W(h) and W(x) are weight matrices, b is the bias term, and f is the activation function.
+
+Applications:
+==========
+RNNs are commonly used in:
+* Natural Language Processing: Tasks such as language modeling, text generation, and sentiment analysis.
+* Speech Recognition: Processing audio signals to convert speech into text.
+* Time Series Prediction: Forecasting stock prices or weather conditions based on historical data.

From a61e1bc4094051ab1093aba649f64e2ecb90cf9b Mon Sep 17 00:00:00 2001
From: Amogh Singhal
Date: 2024年11月27日 09:53:22 +0530
Subject: [PATCH 33/38] Update GeneralMLPrep.md

---
 DataScience/GeneralMLPrep.md | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md
index 6b333d2..e810021 100644
--- a/DataScience/GeneralMLPrep.md
+++ b/DataScience/GeneralMLPrep.md
@@ -31,3 +31,30 @@ RNNs are commonly used in:
 * Natural Language Processing: Tasks such as language modeling, text generation, and sentiment analysis.
 * Speech Recognition: Processing audio signals to convert speech into text.
 * Time Series Prediction: Forecasting stock prices or weather conditions based on historical data.
+ +Decision Tree +========== +* Decision tree is a supervised ML algorithm used in classification and regression taks +* It is able to model decision and possible consequences in the form of a tree like strcuture +* The branch represents a `decision rule` and the internal node represents a `feature`. The leaf node or the terminal node of the branch is the `outcome` + +Building a Decision Tree: +========== +DEINR (pronounced as "Diner") : Data; Entropy; InformationGain ; NodeSeletion; RecursiveSplitting +* Data Input: Start with the entire dataset. +* Entropy Calculation: Calculate the entropy of the target variable and predictor attributes to measure impurity. +* Information Gain: Determine the information gain for each attribute to identify which feature best splits the data. +* Node Selection: Choose the attribute with the highest information gain as the root node. +* Recursive Splitting: Repeat this process recursively for each branch until all branches are finalized or a stopping criterion is met (e.g., maximum depth or minimum samples per leaf) + +Advantages: +========== +* Easy to interpret and visualize. +* Requires little data preprocessing (no need for normalization). +* Can handle both numerical and categorical data. + +Disadvantages: +============ +* Prone to overfitting, especially with deep trees. +* Sensitive to small variations in data. + From c6515d89301327195ddbc5245fd4a9e6302edc67 Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年11月27日 09:55:01 +0530 Subject: [PATCH 34/38] Update GeneralMLPrep.md --- DataScience/GeneralMLPrep.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md index e810021..15669da 100644 --- a/DataScience/GeneralMLPrep.md +++ b/DataScience/GeneralMLPrep.md @@ -45,7 +45,7 @@ DEINR (pronounced as "Diner") : Data; Entropy; InformationGain ; NodeSeletion; R * Entropy Calculation: Calculate the entropy of the target variable and predictor attributes to measure impurity. * Information Gain: Determine the information gain for each attribute to identify which feature best splits the data. * Node Selection: Choose the attribute with the highest information gain as the root node. -* Recursive Splitting: Repeat this process recursively for each branch until all branches are finalized or a stopping criterion is met (e.g., maximum depth or minimum samples per leaf) +* Recursive Splitting: Repeat this process recursively for each branch until all branches are finalized or a *stopping criterion is met (e.g., maximum depth or minimum samples per leaf)* Advantages: ========== From 2f9aa21d25efc41b4734ebdab1c34fab81c97bcd Mon Sep 17 00:00:00 2001 From: Amogh Singhal Date: 2024年11月27日 10:09:03 +0530 Subject: [PATCH 35/38] Update GeneralMLPrep.md --- DataScience/GeneralMLPrep.md | 42 ++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md index 15669da..0ef9341 100644 --- a/DataScience/GeneralMLPrep.md +++ b/DataScience/GeneralMLPrep.md @@ -58,3 +58,45 @@ Disadvantages: * Prone to overfitting, especially with deep trees. * Sensitive to small variations in data. 
+
+Random Forest
+==========
+* Random Forest is an ensemble technique that combines multiple decision trees.
+* It mitigates overfitting by averaging the results of many trees, which individually may have high variance.
+
+Building a Random Forest:
+==========
+BTA (pronounced as "beta"): BootStrapSampling; TreeConstruction; Aggregation
+* Bootstrap Sampling: Randomly select subsets of the training data with replacement to create multiple datasets.
+* Tree Construction: For each subset, build a decision tree using a random selection of features at each split.
+* Aggregation: During prediction, aggregate the results from all trees (e.g., majority vote for classification or average for regression)
+
+Advantages:
+==========
+* Reduces overfitting compared to individual decision trees.
+* Handles large datasets with higher dimensionality well.
+* Provides feature importance scores.
+
+Disadvantages:
+==========
+* More complex and less interpretable than single decision trees.
+* Requires more computational resources.
+
+Bagging or (B)ootstrap (Agg)regating
+====================================
+* This is an ensemble technique aimed at improving the accuracy and stability of ML models.
+* It is done by combining multiple models trained on different subsets of the training data.
+
+How Bagging Works:
+===============
+* Multiple Samples: Generate multiple bootstrap samples from the original dataset.
+* Model Training: Train a separate model (e.g., decision tree) on each bootstrap sample.
+* Final Prediction: Aggregate predictions from all models (e.g., majority voting for classification)
+
+Advantages:
+==========
+* Reduces variance and helps prevent overfitting.
+* Improves model robustness against noise in data.
+
+Disadvantages:
+=================
+* May not significantly improve performance if base learners are not diverse.

From 5c1272a9b14025505c2b665ac5493fe7bd195e3f Mon Sep 17 00:00:00 2001
From: Amogh Singhal
Date: 2024年11月27日 10:12:43 +0530
Subject: [PATCH 36/38] Update GeneralMLPrep.md

---
 DataScience/GeneralMLPrep.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md
index 0ef9341..f61a810 100644
--- a/DataScience/GeneralMLPrep.md
+++ b/DataScience/GeneralMLPrep.md
@@ -90,7 +90,7 @@ How Bagging Works:
 ===============
 * Multiple Samples: Generate multiple bootstrap samples from the original dataset.
 * Model Training: Train a separate model (e.g., decision tree) on each bootstrap sample.
-* Final Prediction: Aggregate predictions from all models (e.g., majority voting for classification)
+* Final Prediction: Aggregate predictions from all models (e.g., majority voting for classification, averaging for regression)

 Advantages:
 ==========

From 1e36fb481f9caff0e8f08562d6a401075a8e738b Mon Sep 17 00:00:00 2001
From: Amogh Singhal
Date: 2024年11月27日 10:16:37 +0530
Subject: [PATCH 37/38] Update GeneralMLPrep.md

---
 DataScience/GeneralMLPrep.md | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/DataScience/GeneralMLPrep.md b/DataScience/GeneralMLPrep.md
index f61a810..e8c4f59 100644
--- a/DataScience/GeneralMLPrep.md
+++ b/DataScience/GeneralMLPrep.md
@@ -99,4 +99,32 @@ Advantages:
 
 Disadvantages:
 =================
-* May not significantly improve performance if base learners are not diverse.
+* May not significantly improve performance if base learners are not diverse.
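+
+A rough sketch of the bagging loop written by hand (scikit-learn is assumed to be available; its `BaggingClassifier` and `RandomForestClassifier` wrap this same pattern, with Random Forest additionally subsampling features at each split):
+
+```python
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.tree import DecisionTreeClassifier
+
+# Toy binary-classification data; the parameters are arbitrary, for illustration only
+X, y = make_classification(n_samples=300, n_features=10, random_state=42)
+
+rng = np.random.default_rng(42)
+predictions = []
+
+for _ in range(25):
+    # 1. Bootstrap sample: draw rows with replacement
+    idx = rng.integers(0, len(X), size=len(X))
+    # 2. Train one high-variance base learner per sample
+    tree = DecisionTreeClassifier(random_state=0).fit(X[idx], y[idx])
+    predictions.append(tree.predict(X))
+
+# 3. Aggregate: majority vote across the ensemble (labels are 0/1, so a mean threshold works)
+bagged = (np.mean(predictions, axis=0) > 0.5).astype(int)
+print("ensemble agreement with labels:", (bagged == y).mean())
+```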
+
+Boosting
+====================================
+* This is an ensemble technique aimed at improving the accuracy and stability of ML models.
+* It is done by combining weak learners (models that perform slightly better than random chance) to create a strong learner.
+* The strong learner is built iteratively, with a focus on misclassified instances.
+
+How Boosting Works:
+===============
+* Sequential Learning: Models are trained sequentially, where each new model focuses on correcting errors made by previous models.
+* Weight Adjustment: Misclassified instances are given higher weights so that subsequent models pay more attention to them.
+* Final Prediction: Combine predictions from all models, typically using weighted voting or averaging
+
+Popular Boosting Algorithms:
+==========
+* AdaBoost
+* Gradient Boosting
+* XGBoost
+
+Advantages:
+==========
+* Often achieves high accuracy and performs well even with limited data.
+* Can handle various types of data and relationships.
+
+Disadvantages:
+=================
+* More prone to overfitting than bagging if not carefully tuned.
+* Requires careful tuning of parameters.

From 5e2189c50841bb9ed54807986b8dd010590fd9b5 Mon Sep 17 00:00:00 2001
From: Amogh Singhal
Date: 2024年11月27日 10:46:08 +0530
Subject: [PATCH 38/38] Create LLMPrep.md

---
 DataScience/LLMPrep.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 DataScience/LLMPrep.md

diff --git a/DataScience/LLMPrep.md b/DataScience/LLMPrep.md
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/DataScience/LLMPrep.md
@@ -0,0 +1 @@
+