I am working on this function here and it produces the desired output. I just want to make sure I'm going about things in a smart way.
It effectively just sorts through the data, with consideration to the fact that campaigns may be created by different firms. Then it aggregates the data.
from collections import defaultdict
def aggregate_ads(ad_data, labels=list(), default_advertiser='internal'):
# Creates dicts to hold data, structured to provide code-readability
ads_data = defaultdict(
lambda: defaultdict(
lambda: defaultdict(int)
))
# Lowercases all labels
labels = map(str.lower, labels)
# Sorts each instance into its channel and adds
for adgroup in ad_data:
# Cleans and standardizes campaign name
campaign_name = adgroup['campaign']['name']
campaign_name = campaign_name.replace('-', ' ').replace('_', ' ').lower()
# Handling where ad_group type is not provided
if not adgroup['ad_group'].get('type_'):
adgroup['ad_group']['type_'] = 'MIXED'
# Collects channel and metrics
channel = adgroup['ad_group']['type_']
metrics = dict(
impressions= int(adgroup['metrics']['impressions']),
clicks = int(adgroup['metrics']['clicks']),
# Converts cost in microns to usd
cost = round(int(adgroup['metrics']['cost_micros'])/1000000, 2),
)
# Checks for labels in campaign name and defaults to specified default
advertisers = set(labels).intersection(campaign_name.split())
if not advertisers:
advertisers.add(default_advertiser)
# Adds the variables to ads_data
for advertiser in advertisers:
ads_data[advertiser][channel]['impressions'] += metrics['impressions']
ads_data[advertiser][channel]['clicks'] += metrics['clicks']
ads_data[advertiser][channel]['cost'] += metrics['cost']
# Converts into regular dict on return
return dict(
(advertiser, dict((ad_type, dict(metrics))
for ad_type, metrics in ad_data.items()))
for advertiser, ad_data in ads_data.items())
Here's my output:
{'internal': {'DISPLAY_STANDARD': {'clicks': 163,
'cost': 11.8,
'impressions': 6785},
'MIXED': {'clicks': 6, 'cost': 0.1, 'impressions': 434},
'SEARCH_STANDARD': {'clicks': 2,
'cost': 5.89,
'impressions': 151}},
'play': {'MIXED': {'clicks': 5, 'cost': 0.05, 'impressions': 242}}}
and some sample input:
example = [
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '5', 'cost_micros': '54238', 'impressions': '242'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global-Desktop-202208',
'start_date': '2022-07-21',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '95', 'cost_micros': '6036546', 'impressions': '4186'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Search-USA/NOTES',
'start_date': '2022-08-30',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'SEARCH_STANDARD'},
'metrics': {'clicks': '2', 'cost_micros': '5890000', 'impressions': '151'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global--Desktop-Files',
'start_date': '2022-09-02',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '68', 'cost_micros': '5757098', 'impressions': '2599'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}}
]
labels = ['play'] # In reality, this is partner labels they'd put in the campaign name
aggregate_ads(example, labels)
1 Answer 1
You have a critical bug. You assign a map
to labels
, but this is never materialised and so is consumed and subsequently looks like an empty collection.
I'm going to suggest that you throw away most of your implementation and replace it with Pandas, which is well-suited to your case. The entry point will be putting your example code through pd.json_normalize
, which will give you a data frame that has six rows for your example data.
Replacing your dict generators with Pandas to_dict
, and performing the sum in a vectorised manner instead of in a loop, your code could look like the following:
from pprint import pprint
import pandas as pd
def aggregate_ads(ad_data: dict, labels: set[str], default_advertiser: str = 'internal') -> pd.DataFrame:
df = pd.json_normalize(ad_data).astype({
'metrics.clicks': int,
'metrics.cost_micros': int,
'metrics.impressions': int,
})
df['channel'] = df['ad_group.type_'].fillna('MIXED')
df['metrics.cost'] = df['metrics.cost_micros'] / 1e6
campaign_fragments = (
df['campaign.name']
.str.replace('-', ' ').str.replace('_', ' ').str.lower().str.split()
.apply(set).apply(labels.__and__)
)
campaign_fragments[campaign_fragments == set()] = default_advertiser
df['advertisers'] = campaign_fragments
return (
df.explode('advertisers')
.groupby(['advertisers', 'channel'])
['metrics.impressions', 'metrics.clicks', 'metrics.cost']
.sum()
.rename(columns={
'metrics.impressions': 'impressions',
'metrics.clicks': 'clicks',
'metrics.cost': 'cost',
})
)
def ads_to_json(groups: pd.DataFrame) -> dict:
inners = groups.groupby(level=0).apply(
lambda df: df.droplevel(0).to_dict('index'))
return inners.to_dict()
def test() -> None:
example = [
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '5', 'cost_micros': '54238', 'impressions': '242'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global-Desktop-202208',
'start_date': '2022-07-21',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '95', 'cost_micros': '6036546', 'impressions': '4186'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Search-USA/NOTES',
'start_date': '2022-08-30',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'SEARCH_STANDARD'},
'metrics': {'clicks': '2', 'cost_micros': '5890000', 'impressions': '151'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global--Desktop-Files',
'start_date': '2022-09-02',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '68', 'cost_micros': '5757098', 'impressions': '2599'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}}
]
grouped = aggregate_ads(example, labels={'play'})
js = ads_to_json(grouped)
pprint(js)
if __name__ == '__main__':
test()
-
\$\begingroup\$ Thanks for this! You're clearly much more advanced than I am, and I'm excited learn how all of this functions. A quick question though, what is "labels.__and__" ? Specifically, __and__ \$\endgroup\$Guy– Guy2022年10月13日 20:41:08 +00:00Commented Oct 13, 2022 at 20:41
-
1\$\begingroup\$ That's a tricky bit of dunder magic that actually means "use a function binding to the
&
operator, which in this case is set intersection" \$\endgroup\$Reinderien– Reinderien2022年10月13日 22:47:27 +00:00Commented Oct 13, 2022 at 22:47