Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c195ba1

Browse files
finish pipeline for contest problems and tags
1 parent 25c0efb commit c195ba1

File tree

6 files changed

+256
-1
lines changed

6 files changed

+256
-1
lines changed

‎dags/contest_problems_dag.py‎

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
import os
2+
import sys
3+
from datetime import datetime
4+
5+
from airflow import DAG
6+
from airflow.operators.python import PythonOperator
7+
8+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Fix ModuleNotFoundError
9+
10+
from operators.contest_problems_ops import etl_contest_problems, etl_problem_tags
11+
12+
# Arguments shared by every task of the pipeline
default_args = {
    "owner": "minhduc29",
    "depends_on_past": False,
    "start_date": datetime(2025, 1, 17),
}

# Pipeline definition: no schedule interval is set and catchup is disabled.
# Operators created inside the ``with`` block are attached to this DAG.
with DAG(
    "contest_problems_pipeline",
    default_args=default_args,
    schedule_interval=None,
    catchup=False,
) as dag:
    # Step 1: collect the problems of past contests (op_args -> num_pages=1)
    etl_problems = PythonOperator(
        task_id="etl_contest_problems",
        python_callable=etl_contest_problems,
        op_args=[1],
    )

    # Step 2: collect the topic tags of every problem found in step 1
    etl_tags = PythonOperator(
        task_id="etl_problem_tags",
        python_callable=etl_problem_tags,
    )

    # Tags can only be fetched once the problem list exists
    etl_problems >> etl_tags

‎operators/contest_problems_ops.py‎

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import pandas as pd
2+
import requests
3+
from collections import defaultdict
4+
5+
from utils.queries import contest_problems_query, problem_tags_query
6+
from utils.constants import URL, OUTPUT_PATH, BUCKET_NAME
7+
8+
9+
def etl_contest_problems(num_pages):
    """Extract, transform, and load the problems of past contests.

    Requests pages ``1..num_pages`` of the past-contests listing from ``URL``,
    flattens every contest into ``contest``/``problem`` rows with
    ``parse_contest_problems`` and writes the rows as one CSV file.

    Args:
        num_pages: Number of listing pages to fetch, starting at page 1.

    Returns:
        The path of the CSV file that was written.

    Raises:
        requests.RequestException: If a request times out or the server
            answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the task forever.
    request_timeout = 30  # seconds

    rows = []
    for page in range(1, num_pages + 1):
        response = requests.post(
            URL, json=contest_problems_query(page), timeout=request_timeout
        )
        # Surface HTTP failures here instead of as a confusing KeyError below.
        response.raise_for_status()
        contests = response.json()["data"]["pastContests"]["data"]
        for contest in contests:
            # Transform each contest into one row per problem
            rows.extend(parse_contest_problems(contest))

    # NOTE: f"{OUTPUT_PATH}/raw/contest_problems.csv" is the alternative,
    # non-S3 destination for this file.
    output_path = f"s3://{BUCKET_NAME}/raw/contest_problems.csv"

    # Explicit columns keep the header row even when no rows were collected,
    # so a reader of this file never sees a header-less CSV.
    pd.DataFrame(rows, columns=["contest", "problem"]).to_csv(output_path, index=False)
    return output_path
21+
22+
23+
def parse_contest_problems(contest_data):
    """Flatten one contest record into ``contest``/``problem`` rows.

    Args:
        contest_data: Mapping with the contest's ``"titleSlug"`` and a
            ``"questions"`` list whose items each carry their own
            ``"titleSlug"``.

    Returns:
        A list of ``{"contest": ..., "problem": ...}`` dicts, one per
        question. The list is empty when ``"questions"`` is missing, ``None``
        or empty.
    """
    # A contest without questions contributes no rows instead of raising.
    questions = contest_data.get("questions") or []
    return [
        {
            "contest": contest_data["titleSlug"],
            "problem": question["titleSlug"],
        }
        for question in questions
    ]
31+
32+
33+
def etl_problem_tags(task_instance):
    """Extract, transform, and load the topic tags of every contest problem.

    Reads the problems CSV whose path the ``etl_contest_problems`` task
    returned, requests the tags of each problem from ``URL`` and writes two
    files: one row per problem with its list of tags, and one row per tag
    with the number of problems carrying it.

    Args:
        task_instance: Object whose ``xcom_pull`` returns the value
            published by the ``etl_contest_problems`` task.

    Returns:
        The path of the per-problem tags CSV.

    Raises:
        requests.RequestException: If a request times out or the server
            answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block the task forever.
    request_timeout = 30  # seconds

    input_path = task_instance.xcom_pull(task_ids="etl_contest_problems")
    problems = pd.read_csv(input_path)["problem"]

    tag_counts = defaultdict(int)  # Number of problems each topic tag appears on
    rows = []
    for problem in problems:
        response = requests.post(
            URL, json=problem_tags_query(problem), timeout=request_timeout
        )
        # Surface HTTP failures here instead of as a confusing KeyError below.
        response.raise_for_status()
        tags = parse_problem_tags(response.json()["data"]["question"])
        rows.append({"problem": problem, "tags": tags})
        for tag in tags:
            tag_counts[tag] += 1

    # Load tags data to the destination storage.
    # NOTE: f"{OUTPUT_PATH}/raw/problem_tags.csv" is the alternative,
    # non-S3 destination for this file.
    output_path = f"s3://{BUCKET_NAME}/raw/problem_tags.csv"
    # Explicit columns keep the header row even when there were no problems.
    pd.DataFrame(rows, columns=["problem", "tags"]).to_csv(output_path, index=False)

    # Load tags counter data to the destination storage.
    # NOTE: f"{OUTPUT_PATH}/processed/tags_counter.csv" is the alternative,
    # non-S3 destination for this file.
    counter_path = f"s3://{BUCKET_NAME}/processed/tags_counter.csv"
    pd.Series(tag_counts).to_csv(counter_path, header=False)

    return output_path
62+
63+
64+
def parse_problem_tags(problem_data):
    """Extract the list of topic tag names from one problem record.

    Args:
        problem_data: Mapping with a ``"topicTags"`` list whose items each
            carry a ``"name"``, or ``None`` when no problem record was
            returned.

    Returns:
        The tag names in their original order. The list is empty when the
        record itself or its ``"topicTags"`` entry is missing or ``None``.
    """
    # A missing record yields no tags instead of a TypeError on subscripting.
    if not problem_data:
        return []
    return [tag["name"] for tag in problem_data.get("topicTags") or []]
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
Array,28
2+
Matrix,4
3+
Simulation,3
4+
Dynamic Programming,8
5+
Binary Search,5
6+
Depth-First Search,5
7+
Breadth-First Search,4
8+
Graph,3
9+
Shortest Path,2
10+
Two Pointers,2
11+
Stack,2
12+
Segment Tree,4
13+
Queue,1
14+
Monotonic Stack,1
15+
Monotonic Queue,1
16+
Math,9
17+
Sliding Window,2
18+
Enumeration,7
19+
Number Theory,3
20+
Hash Table,10
21+
String,6
22+
Greedy,3
23+
Sorting,6
24+
Prefix Sum,2
25+
String Matching,1
26+
Design,1
27+
Heap (Priority Queue),2
28+
Ordered Set,1
29+
Combinatorics,2
30+
Bit Manipulation,3
31+
Counting,2
32+
Binary Indexed Tree,2
33+
Geometry,2
34+
Backtracking,1
35+
Bitmask,1
36+
Union Find,1
37+
Tree,2

‎sample_data/raw/contest_problems.csv‎

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
contest,problem
2+
weekly-contest-432,zigzag-grid-traversal-with-skip
3+
weekly-contest-432,maximum-amount-of-money-robot-can-earn
4+
weekly-contest-432,minimize-the-maximum-edge-weight-of-graph
5+
weekly-contest-432,count-non-decreasing-subarrays-after-k-operations
6+
weekly-contest-431,maximum-subarray-with-equal-products
7+
weekly-contest-431,find-mirror-score-of-a-string
8+
weekly-contest-431,maximum-coins-from-k-consecutive-bags
9+
weekly-contest-431,maximum-score-of-non-overlapping-intervals
10+
biweekly-contest-147,substring-matching-pattern
11+
biweekly-contest-147,design-task-manager
12+
biweekly-contest-147,longest-subsequence-with-decreasing-adjacent-difference
13+
biweekly-contest-147,maximize-subarray-sum-after-removing-all-occurrences-of-one-element
14+
weekly-contest-430,minimum-operations-to-make-columns-strictly-increasing
15+
weekly-contest-430,find-the-lexicographically-largest-string-from-the-box-i
16+
weekly-contest-430,count-special-subsequences
17+
weekly-contest-430,count-the-number-of-arrays-with-k-matching-adjacent-elements
18+
weekly-contest-429,minimum-number-of-operations-to-make-elements-in-array-distinct
19+
weekly-contest-429,maximum-number-of-distinct-elements-after-operations
20+
weekly-contest-429,smallest-substring-with-identical-characters-i
21+
weekly-contest-429,smallest-substring-with-identical-characters-ii
22+
biweekly-contest-146,count-subarrays-of-length-three-with-a-condition
23+
biweekly-contest-146,count-paths-with-the-given-xor-value
24+
biweekly-contest-146,check-if-grid-can-be-cut-into-sections
25+
biweekly-contest-146,subsequences-with-a-unique-middle-mode-i
26+
weekly-contest-428,button-with-longest-push-time
27+
weekly-contest-428,maximize-amount-after-two-days-of-conversions
28+
weekly-contest-428,count-beautiful-splits-in-an-array
29+
weekly-contest-428,minimum-operations-to-make-character-frequencies-equal
30+
weekly-contest-427,transformed-array
31+
weekly-contest-427,maximum-area-rectangle-with-point-constraints-i
32+
weekly-contest-427,maximum-subarray-sum-with-length-divisible-by-k
33+
weekly-contest-427,maximum-area-rectangle-with-point-constraints-ii
34+
biweekly-contest-145,minimum-operations-to-make-array-values-equal-to-k
35+
biweekly-contest-145,minimum-time-to-break-locks-i
36+
biweekly-contest-145,digit-operations-to-make-two-integers-equal
37+
biweekly-contest-145,count-connected-components-in-lcm-graph
38+
weekly-contest-426,smallest-number-with-all-set-bits
39+
weekly-contest-426,identify-the-largest-outlier-in-an-array
40+
weekly-contest-426,maximize-the-number-of-target-nodes-after-connecting-trees-i
41+
weekly-contest-426,maximize-the-number-of-target-nodes-after-connecting-trees-ii

‎sample_data/raw/problem_tags.csv‎

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
problem,tags
2+
zigzag-grid-traversal-with-skip,"['Array', 'Matrix', 'Simulation']"
3+
maximum-amount-of-money-robot-can-earn,"['Array', 'Dynamic Programming', 'Matrix']"
4+
minimize-the-maximum-edge-weight-of-graph,"['Binary Search', 'Depth-First Search', 'Breadth-First Search', 'Graph', 'Shortest Path']"
5+
count-non-decreasing-subarrays-after-k-operations,"['Array', 'Two Pointers', 'Stack', 'Segment Tree', 'Queue', 'Monotonic Stack', 'Monotonic Queue']"
6+
maximum-subarray-with-equal-products,"['Array', 'Math', 'Sliding Window', 'Enumeration', 'Number Theory']"
7+
find-mirror-score-of-a-string,"['Hash Table', 'String', 'Stack', 'Simulation']"
8+
maximum-coins-from-k-consecutive-bags,"['Array', 'Binary Search', 'Greedy', 'Sliding Window', 'Sorting', 'Prefix Sum']"
9+
maximum-score-of-non-overlapping-intervals,"['Array', 'Binary Search', 'Dynamic Programming', 'Sorting']"
10+
substring-matching-pattern,"['String', 'String Matching']"
11+
design-task-manager,"['Hash Table', 'Design', 'Heap (Priority Queue)', 'Ordered Set']"
12+
longest-subsequence-with-decreasing-adjacent-difference,"['Array', 'Dynamic Programming']"
13+
maximize-subarray-sum-after-removing-all-occurrences-of-one-element,"['Array', 'Dynamic Programming', 'Segment Tree']"
14+
minimum-operations-to-make-columns-strictly-increasing,"['Array', 'Greedy', 'Matrix']"
15+
find-the-lexicographically-largest-string-from-the-box-i,"['Two Pointers', 'String', 'Enumeration']"
16+
count-special-subsequences,"['Array', 'Hash Table', 'Math', 'Enumeration']"
17+
count-the-number-of-arrays-with-k-matching-adjacent-elements,"['Math', 'Combinatorics']"
18+
minimum-number-of-operations-to-make-elements-in-array-distinct,"['Array', 'Hash Table']"
19+
maximum-number-of-distinct-elements-after-operations,"['Array', 'Greedy', 'Sorting']"
20+
smallest-substring-with-identical-characters-i,"['Array', 'Binary Search', 'Enumeration']"
21+
smallest-substring-with-identical-characters-ii,"['String', 'Binary Search']"
22+
count-subarrays-of-length-three-with-a-condition,['Array']
23+
count-paths-with-the-given-xor-value,"['Array', 'Dynamic Programming', 'Bit Manipulation', 'Matrix']"
24+
check-if-grid-can-be-cut-into-sections,"['Array', 'Sorting']"
25+
subsequences-with-a-unique-middle-mode-i,"['Array', 'Hash Table', 'Math', 'Combinatorics']"
26+
button-with-longest-push-time,['Array']
27+
maximize-amount-after-two-days-of-conversions,"['Array', 'String', 'Depth-First Search', 'Breadth-First Search', 'Graph']"
28+
count-beautiful-splits-in-an-array,"['Array', 'Dynamic Programming']"
29+
minimum-operations-to-make-character-frequencies-equal,"['Hash Table', 'String', 'Dynamic Programming', 'Counting', 'Enumeration']"
30+
transformed-array,"['Array', 'Simulation']"
31+
maximum-area-rectangle-with-point-constraints-i,"['Array', 'Math', 'Binary Indexed Tree', 'Segment Tree', 'Geometry', 'Sorting', 'Enumeration']"
32+
maximum-subarray-sum-with-length-divisible-by-k,"['Array', 'Hash Table', 'Prefix Sum']"
33+
maximum-area-rectangle-with-point-constraints-ii,"['Array', 'Math', 'Binary Indexed Tree', 'Segment Tree', 'Geometry', 'Sorting']"
34+
minimum-operations-to-make-array-values-equal-to-k,"['Array', 'Hash Table']"
35+
minimum-time-to-break-locks-i,"['Array', 'Dynamic Programming', 'Backtracking', 'Bit Manipulation', 'Depth-First Search', 'Bitmask']"
36+
digit-operations-to-make-two-integers-equal,"['Math', 'Graph', 'Heap (Priority Queue)', 'Number Theory', 'Shortest Path']"
37+
count-connected-components-in-lcm-graph,"['Array', 'Hash Table', 'Math', 'Union Find', 'Number Theory']"
38+
smallest-number-with-all-set-bits,"['Math', 'Bit Manipulation']"
39+
identify-the-largest-outlier-in-an-array,"['Array', 'Hash Table', 'Counting', 'Enumeration']"
40+
maximize-the-number-of-target-nodes-after-connecting-trees-i,"['Tree', 'Depth-First Search', 'Breadth-First Search']"
41+
maximize-the-number-of-target-nodes-after-connecting-trees-ii,"['Tree', 'Depth-First Search', 'Breadth-First Search']"

‎utils/queries.py‎

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
def contest_ranking_query(page):
2-
"""Returns the json argument in request for contest ranking query"""
2+
"""Json argument in request for contest ranking query"""
33
query = f"""
44
query {{
55
globalRanking(page: {page}) {{
@@ -18,3 +18,32 @@ def contest_ranking_query(page):
1818
}}
1919
}}"""
2020
return {"query": query}
21+
22+
23+
def contest_problems_query(page_num, num_per_page=10):
    """Json argument in request for contest problems query

    Args:
        page_num: Page of the past-contests listing to request.
        num_per_page: Number of contests requested per page (default 10).

    Returns:
        A dict with a single ``"query"`` key holding the GraphQL query text,
        which asks for each contest's ``titleSlug`` and the ``titleSlug`` of
        its questions.
    """
    query = f"""
    query {{
        pastContests(pageNo: {page_num}, numPerPage: {num_per_page}) {{
            data {{
                titleSlug
                questions {{
                    titleSlug
                }}
            }}
        }}
    }}"""
    return {"query": query}
37+
38+
39+
def problem_tags_query(title):
    """Json argument in request for problem tags query

    Args:
        title: Title slug of the problem whose topic tags are requested.

    Returns:
        A dict with a single ``"query"`` key holding the GraphQL query text,
        which asks for the ``name`` of each of the problem's ``topicTags``.
    """
    import json  # local import: only needed to escape the slug

    # Emit the slug as a quoted, escaped string literal so that a quote or
    # backslash inside it cannot break out of the string and corrupt the query.
    slug_literal = json.dumps(str(title), ensure_ascii=False)
    query = f"""
    query {{
        question(titleSlug: {slug_literal}) {{
            topicTags {{
                name
            }}
        }}
    }}"""
    return {"query": query}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /