Use row data from a database to find rows in dataframes that match and use data to generate a separate dataframe

Question 1

I have a DataFrame (database_df) that contains the general record with the IDs that are the same team in each of the lines, containing these values I need to find in two APIs (api_1_df, api_2_df) that I collect the data if there is a way to find the same home team and the away team on the same line in both APIs.

When found on the same lines, I need to generate a dataframe with the lines in common with the columns that demonstrate the game's identification code (match_id):

import pandas as pd
api_1_id = "pressure_id"
api_2_id = "betfair_id"
api_1_match_id = "pressure_match_id"
api_2_match_id = "betfair_match_id"
database_df = pd.DataFrame({
 'pressure_id': [101, 102, 103, 201, 202, 203],
 'pressure_name': ["Rangers", "City", "Barcelona FC", "Real Madrid FC", "Liverpool", "Chelsea FC"],
 'betfair_id': [1001, 1002, 1003, 2001, 2002, 2003],
 'betfair_name': ["Rangers FC", "Manchester City", "Barcelona FC", "Real Madrid", "Liverpool FC", "Chelsea"]
})
api_1_df = pd.DataFrame({
 'match_id': [1, 3],
 'home_id': [101, 103],
 'home_name': ["Rangers", "Barcelona FC"],
 'away_id': [201, 203],
 'away_name': ["Real Madrid FC", "Chelsea FC"]
})
api_2_df = pd.DataFrame({
 'match_id': [123, 456, 789],
 'home_id': [1001, 1002, 1003],
 'home_name': ["Rangers", "City", "Barcelona FC"],
 'away_id': [2001, 2002, 2003],
 'away_name': ["Real Madrid", "Liverpool FC", "Chelsea"]
})
def api_1_and_api_2_ids(api_1_df: pd.DataFrame, database_df: pd.DataFrame, api_2_df: pd.DataFrame) -> None:
 result = []
 for _, row in api_1_df.iterrows():
 home_id: pd.DataFrame = database_df.loc[database_df[api_1_id] == row['home_id']]
 away_id: pd.DataFrame = database_df.loc[database_df[api_1_id] == row['away_id']]
 home_id = home_id[api_2_id].iloc[0]
 away_id = away_id[api_2_id].iloc[0]
 matched_rows: pd.DataFrame = api_2_df[(api_2_df['home_id'] == home_id) & (api_2_df['away_id'] == away_id)]
 if not matched_rows.empty:
 result.append({api_1_match_id: row['match_id'], api_2_match_id: matched_rows.iloc[0]['match_id']})
 result_df = pd.DataFrame(result, columns=[api_1_match_id, api_2_match_id])
 result_df = result_df.sort_values(by=[api_1_match_id])
 print(result_df)

Result:

pressure_match_id betfair_match_id
 1 123
 3 789

I would like to improve this method that I am using because it is slow in a large dataframe and I would also like indications of visual improvement because I am finding it very messy and difficult to understand.

Question 2

The loop and iterative concatenation need to go away. This is just a series of merges. You might be able to add some micro-optimizations to make the merges more efficient (making indices, changing the merge order, etc.), but the broad strokes can look like:

import pandas as pd
api_1_id = "pressure_id"
api_2_id = "betfair_id"
api_1_match_id = "pressure_match_id"
api_2_match_id = "betfair_match_id"
def api_1_and_api_2_ids(
 api_1_df: pd.DataFrame,
 database_df: pd.DataFrame,
 api_2_df: pd.DataFrame,
) -> pd.DataFrame:
 db_to_1_home = pd.merge(left=api_1_df, right=database_df, left_on='home_id', right_on=api_1_id)
 db_to_1_away = pd.merge(left=api_1_df, right=database_df, left_on='away_id', right_on=api_1_id)
 db_1_2_home = pd.merge(left=db_to_1_home, right=api_2_df, left_on=api_2_id, right_on='home_id',
 suffixes=['_api1', '_api2'])
 db_1_2 = pd.merge(left=db_to_1_away, right=db_1_2_home, left_on=api_2_id, right_on='away_id_api2',
 suffixes=['_away', '_home'])
 result_df = db_1_2[['match_id_api1', 'match_id_api2']].rename(columns={
 'match_id_api1': api_1_match_id,
 'match_id_api2': api_2_match_id,
 })
 return result_df
def test() -> None:
 database_df = pd.DataFrame({
 'pressure_id': [101, 102, 103, 201, 202, 203],
 'pressure_name': ["Rangers", "City", "Barcelona FC", "Real Madrid FC", "Liverpool",
 "Chelsea FC"],
 'betfair_id': [1001, 1002, 1003, 2001, 2002, 2003],
 'betfair_name': ["Rangers FC", "Manchester City", "Barcelona FC", "Real Madrid",
 "Liverpool FC", "Chelsea"]
 })
 api_1_df = pd.DataFrame({
 'match_id': [1, 3],
 'home_id': [101, 103],
 'home_name': ["Rangers", "Barcelona FC"],
 'away_id': [201, 203],
 'away_name': ["Real Madrid FC", "Chelsea FC"]
 })
 api_2_df = pd.DataFrame({
 'match_id': [123, 456, 789],
 'home_id': [1001, 1002, 1003],
 'home_name': ["Rangers", "City", "Barcelona FC"],
 'away_id': [2001, 2002, 2003],
 'away_name': ["Real Madrid", "Liverpool FC", "Chelsea"]
 })
 result_df = api_1_and_api_2_ids(api_1_df, database_df, api_2_df)
 print(result_df)
if __name__ == '__main__':
 test()

 pressure_match_id betfair_match_id
0 1 123
1 3 789

score 3 · Accepted Answer · 2023-09-07 03:15:42Z

The loop and iterative concatenation need to go away. This is just a series of merges. You might be able to add some micro-optimizations to make the merges more efficient (making indices, changing the merge order, etc.), but the broad strokes can look like:

import pandas as pd
api_1_id = "pressure_id"
api_2_id = "betfair_id"
api_1_match_id = "pressure_match_id"
api_2_match_id = "betfair_match_id"
def api_1_and_api_2_ids(
 api_1_df: pd.DataFrame,
 database_df: pd.DataFrame,
 api_2_df: pd.DataFrame,
) -> pd.DataFrame:
 db_to_1_home = pd.merge(left=api_1_df, right=database_df, left_on='home_id', right_on=api_1_id)
 db_to_1_away = pd.merge(left=api_1_df, right=database_df, left_on='away_id', right_on=api_1_id)
 db_1_2_home = pd.merge(left=db_to_1_home, right=api_2_df, left_on=api_2_id, right_on='home_id',
 suffixes=['_api1', '_api2'])
 db_1_2 = pd.merge(left=db_to_1_away, right=db_1_2_home, left_on=api_2_id, right_on='away_id_api2',
 suffixes=['_away', '_home'])
 result_df = db_1_2[['match_id_api1', 'match_id_api2']].rename(columns={
 'match_id_api1': api_1_match_id,
 'match_id_api2': api_2_match_id,
 })
 return result_df
def test() -> None:
 database_df = pd.DataFrame({
 'pressure_id': [101, 102, 103, 201, 202, 203],
 'pressure_name': ["Rangers", "City", "Barcelona FC", "Real Madrid FC", "Liverpool",
 "Chelsea FC"],
 'betfair_id': [1001, 1002, 1003, 2001, 2002, 2003],
 'betfair_name': ["Rangers FC", "Manchester City", "Barcelona FC", "Real Madrid",
 "Liverpool FC", "Chelsea"]
 })
 api_1_df = pd.DataFrame({
 'match_id': [1, 3],
 'home_id': [101, 103],
 'home_name': ["Rangers", "Barcelona FC"],
 'away_id': [201, 203],
 'away_name': ["Real Madrid FC", "Chelsea FC"]
 })
 api_2_df = pd.DataFrame({
 'match_id': [123, 456, 789],
 'home_id': [1001, 1002, 1003],
 'home_name': ["Rangers", "City", "Barcelona FC"],
 'away_id': [2001, 2002, 2003],
 'away_name': ["Real Madrid", "Liverpool FC", "Chelsea"]
 })
 result_df = api_1_and_api_2_ids(api_1_df, database_df, api_2_df)
 print(result_df)
if __name__ == '__main__':
 test()

 pressure_match_id betfair_match_id
0 1 123
1 3 789

Stack Exchange Network

Use row data from a database to find rows in dataframes that match and use data to generate a separate dataframe

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Hot Network Questions

Use row data from a database to find rows in dataframes that match and use data to generate a separate dataframe

1 Answer 1

Your Answer

Sign up or log in

Post as a guest

Post as a guest

Related

Hot Network Questions