I have a DataFrame (database_df
) that contains the general record with the IDs that are the same team in each of the lines, containing these values I need to find in two APIs (api_1_df
, api_2_df
) that I collect the data if there is a way to find the same home team and the away team on the same line in both APIs.
When found on the same lines, I need to generate a dataframe with the lines in common with the columns that demonstrate the game's identification code (match_id
):
import pandas as pd
api_1_id = "pressure_id"
api_2_id = "betfair_id"
api_1_match_id = "pressure_match_id"
api_2_match_id = "betfair_match_id"
database_df = pd.DataFrame({
'pressure_id': [101, 102, 103, 201, 202, 203],
'pressure_name': ["Rangers", "City", "Barcelona FC", "Real Madrid FC", "Liverpool", "Chelsea FC"],
'betfair_id': [1001, 1002, 1003, 2001, 2002, 2003],
'betfair_name': ["Rangers FC", "Manchester City", "Barcelona FC", "Real Madrid", "Liverpool FC", "Chelsea"]
})
api_1_df = pd.DataFrame({
'match_id': [1, 3],
'home_id': [101, 103],
'home_name': ["Rangers", "Barcelona FC"],
'away_id': [201, 203],
'away_name': ["Real Madrid FC", "Chelsea FC"]
})
api_2_df = pd.DataFrame({
'match_id': [123, 456, 789],
'home_id': [1001, 1002, 1003],
'home_name': ["Rangers", "City", "Barcelona FC"],
'away_id': [2001, 2002, 2003],
'away_name': ["Real Madrid", "Liverpool FC", "Chelsea"]
})
def api_1_and_api_2_ids(api_1_df: pd.DataFrame, database_df: pd.DataFrame, api_2_df: pd.DataFrame) -> None:
result = []
for _, row in api_1_df.iterrows():
home_id: pd.DataFrame = database_df.loc[database_df[api_1_id] == row['home_id']]
away_id: pd.DataFrame = database_df.loc[database_df[api_1_id] == row['away_id']]
home_id = home_id[api_2_id].iloc[0]
away_id = away_id[api_2_id].iloc[0]
matched_rows: pd.DataFrame = api_2_df[(api_2_df['home_id'] == home_id) & (api_2_df['away_id'] == away_id)]
if not matched_rows.empty:
result.append({api_1_match_id: row['match_id'], api_2_match_id: matched_rows.iloc[0]['match_id']})
result_df = pd.DataFrame(result, columns=[api_1_match_id, api_2_match_id])
result_df = result_df.sort_values(by=[api_1_match_id])
print(result_df)
Result:
pressure_match_id betfair_match_id
1 123
3 789
I would like to improve this method that I am using because it is slow in a large dataframe and I would also like indications of visual improvement because I am finding it very messy and difficult to understand.
1 Answer 1
The loop and iterative concatenation need to go away. This is just a series of merges. You might be able to add some micro-optimizations to make the merges more efficient (making indices, changing the merge order, etc.), but the broad strokes can look like:
import pandas as pd
api_1_id = "pressure_id"
api_2_id = "betfair_id"
api_1_match_id = "pressure_match_id"
api_2_match_id = "betfair_match_id"
def api_1_and_api_2_ids(
api_1_df: pd.DataFrame,
database_df: pd.DataFrame,
api_2_df: pd.DataFrame,
) -> pd.DataFrame:
db_to_1_home = pd.merge(left=api_1_df, right=database_df, left_on='home_id', right_on=api_1_id)
db_to_1_away = pd.merge(left=api_1_df, right=database_df, left_on='away_id', right_on=api_1_id)
db_1_2_home = pd.merge(left=db_to_1_home, right=api_2_df, left_on=api_2_id, right_on='home_id',
suffixes=['_api1', '_api2'])
db_1_2 = pd.merge(left=db_to_1_away, right=db_1_2_home, left_on=api_2_id, right_on='away_id_api2',
suffixes=['_away', '_home'])
result_df = db_1_2[['match_id_api1', 'match_id_api2']].rename(columns={
'match_id_api1': api_1_match_id,
'match_id_api2': api_2_match_id,
})
return result_df
def test() -> None:
database_df = pd.DataFrame({
'pressure_id': [101, 102, 103, 201, 202, 203],
'pressure_name': ["Rangers", "City", "Barcelona FC", "Real Madrid FC", "Liverpool",
"Chelsea FC"],
'betfair_id': [1001, 1002, 1003, 2001, 2002, 2003],
'betfair_name': ["Rangers FC", "Manchester City", "Barcelona FC", "Real Madrid",
"Liverpool FC", "Chelsea"]
})
api_1_df = pd.DataFrame({
'match_id': [1, 3],
'home_id': [101, 103],
'home_name': ["Rangers", "Barcelona FC"],
'away_id': [201, 203],
'away_name': ["Real Madrid FC", "Chelsea FC"]
})
api_2_df = pd.DataFrame({
'match_id': [123, 456, 789],
'home_id': [1001, 1002, 1003],
'home_name': ["Rangers", "City", "Barcelona FC"],
'away_id': [2001, 2002, 2003],
'away_name': ["Real Madrid", "Liverpool FC", "Chelsea"]
})
result_df = api_1_and_api_2_ids(api_1_df, database_df, api_2_df)
print(result_df)
if __name__ == '__main__':
test()
pressure_match_id betfair_match_id
0 1 123
1 3 789