Skip to content

Commit fd8e294

Browse files
Add Flickr fetch script with workflow integration
Fetches CC license data for all 8 license types from Flickr API. Includes GitHub Actions integration and CSV output. Resolves #164
1 parent 19249f8 commit fd8e294

File tree

2 files changed

+229
-0
lines changed

2 files changed

+229
-0
lines changed

.github/workflows/1-fetch.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,14 @@ jobs:
6767
--enable-save --enable-git
6868
env:
6969
GH_TOKEN: ${{ secrets.BOT_TOKEN }}
70+
71+
# CC Technology team members:
72+
# See cc-quantifying-bot Flickr entry in Bitwarden for information
73+
# on FLICKR_ secrets
74+
- name: Fetch from Flickr
75+
run: |
76+
./scripts/1-fetch/flickr_fetch.py \
77+
--limit=100 --enable-save --enable-git
78+
env:
79+
FLICKR_API_KEY: ${{ secrets.FLICKR_API_KEY }}
80+
FLICKR_API_SECRET: ${{ secrets.FLICKR_API_SECRET }}

scripts/1-fetch/flickr_fetch.py

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
#!/usr/bin/env python
2+
"""
3+
Fetch CC photo license data from Flickr API for quarterly analysis.
4+
"""
5+
6+
# Standard library
7+
import argparse
8+
import csv
9+
import json
10+
import os
11+
import sys
12+
import time
13+
import traceback
14+
15+
# Third-party
16+
import flickrapi
17+
from dotenv import load_dotenv
18+
from pygments import highlight
19+
from pygments.formatters import TerminalFormatter
20+
from pygments.lexers import PythonTracebackLexer
21+
22+
sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
23+
# First-party/Local
24+
import shared # noqa: E402
25+
26+
LOGGER, PATHS = shared.setup(__file__)
load_dotenv(PATHS["dotenv"])

# API credentials are read from the environment (the dotenv file loaded
# above may populate it)
FLICKR_API_KEY = os.getenv("FLICKR_API_KEY")
FLICKR_API_SECRET = os.getenv("FLICKR_API_SECRET")

# Output CSV for this script and its column header
FILE1_COUNT = os.path.join(PATHS["data_phase"], "flickr_1_count.csv")
HEADER1_COUNT = ["LICENSE_ID", "LICENSE_NAME", "COUNT"]
QUARTER = os.path.basename(PATHS["data_quarter"])

# License ids and names as reported by the
# flickr.photos.licenses.getInfo API
FLICKR_LICENSES = {
    1: "CC BY-NC-SA 2.0",
    2: "CC BY-NC 2.0",
    3: "CC BY-NC-ND 2.0",
    4: "CC BY 2.0",
    5: "CC BY-SA 2.0",
    6: "CC BY-ND 2.0",
    9: "Public Domain Dedication (CC0)",
    10: "Public Domain Mark",
}

# Every license id in the mapping above is queried, in declaration order:
# [1, 2, 3, 4, 5, 6, 9, 10]
CC_LICENSES = list(FLICKR_LICENSES)

LOGGER.info("Script execution started.")
50+
51+
52+
def parse_arguments():
    """Parse and validate the command-line options.

    Returns:
        argparse.Namespace with the attributes ``limit`` (int),
        ``enable_save``, ``enable_git`` and ``dev`` (booleans).

    The parser exits with a usage error when ``--limit`` is not a positive
    integer or when ``--enable-git`` is given without ``--enable-save``.
    """
    LOGGER.info("Parsing command-line options")
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--limit",
        type=int,
        default=100,
        help="Limit number of photos per license (default: 100)",
    )
    parser.add_argument(
        "--enable-save",
        action="store_true",
        help="Enable saving results",
    )
    parser.add_argument(
        "--enable-git",
        action="store_true",
        help="Enable git actions (fetch, merge, add, commit, and push)",
    )
    parser.add_argument(
        "--dev",
        action="store_true",
        help="Development mode: generate fake data without API calls",
    )
    args = parser.parse_args()
    # A zero or negative limit would be used as the search page size and
    # as the cap on the reported counts, so reject it up front.
    if args.limit < 1:
        parser.error("--limit must be a positive integer")
    if args.enable_git and not args.enable_save:
        parser.error("--enable-git requires --enable-save")
    return args
80+
81+
82+
def get_flickr_api():
    """Build the Flickr API client used for the license searches.

    Returns:
        A ``flickrapi.FlickrAPI`` instance configured with ``format="json"``.

    Raises:
        shared.QuantifyingException: if either FLICKR_API_KEY or
            FLICKR_API_SECRET is unset or empty.
    """
    LOGGER.info("Setting up Flickr API")

    credentials = (FLICKR_API_KEY, FLICKR_API_SECRET)
    if not all(credentials):
        raise shared.QuantifyingException(
            "Missing Flickr API credentials. Check your .env file."
        )

    api_key, api_secret = credentials
    return flickrapi.FlickrAPI(api_key, api_secret, format="json")
95+
96+
97+
def fetch_license_count(flickr, license_id, limit=100):
    """Fetch photo count for a specific license from Flickr API.

    Args:
        flickr: Flickr API client exposing ``photos.search``; its response
            is decoded as UTF-8 and parsed as JSON.
        license_id: Flickr license id to search for.
        limit: Upper bound applied to the returned count (default: 100).

    Returns:
        The total reported by the API capped at ``limit`` (never negative),
        or 0 when the request fails or the response carries no total.
    """
    license_name = FLICKR_LICENSES.get(license_id, "Unknown")
    LOGGER.info(f"Fetching count for license {license_id}: {license_name}")

    # Keep the requested page size within 1..500 so that a zero or negative
    # limit cannot produce an invalid request.
    per_page = max(1, min(limit, 500))

    try:
        photos_json = flickr.photos.search(
            license=license_id, per_page=per_page, page=1
        )

        photos_data = json.loads(photos_json.decode("utf-8"))

        if "photos" in photos_data and "total" in photos_data["photos"]:
            total = int(photos_data["photos"]["total"])
            # Never report a negative count, whatever limit was passed
            count = max(0, min(total, limit))
            LOGGER.info(f" Found {count} photos (total available: {total})")
            return count

        LOGGER.warning(f" No data returned for license {license_id}")
        # NOTE(review): presumably a failed call returns a payload with a
        # "message" field -- confirm against the Flickr API. Surface it so
        # the reason for the missing data is visible in the log.
        if "message" in photos_data:
            LOGGER.warning(f" Flickr API message: {photos_data['message']}")
        return 0

    except Exception as e:
        # Best effort: a failure for one license is logged and counted as 0
        # so that the remaining licenses can still be fetched.
        LOGGER.error(f" Failed to fetch count for license {license_id}: {e}")
        return 0
121+
122+
123+
def generate_fake_data(args):
    """Build deterministic placeholder counts for dev mode (no API calls).

    Args:
        args: Parsed command-line options; only ``args.limit`` is read.

    Returns:
        Dict mapping each id in CC_LICENSES to a fake count derived from
        the limit, the license id and the id's position in CC_LICENSES.
    """
    LOGGER.info("Creating fake data for dev mode")

    per_license = args.limit // len(CC_LICENSES)
    return {
        license_id: per_license + (license_id * 10) + (position * 5)
        for position, license_id in enumerate(CC_LICENSES)
    }
133+
134+
135+
def save_data(args, license_counts):
    """Save license count data to CSV file.

    Args:
        args: Parsed command-line options; nothing is written unless
            ``args.enable_save`` is true.
        license_counts: Dict mapping Flickr license id to photo count.

    Raises:
        KeyError: if a license id is not present in FLICKR_LICENSES.
        OSError: if the output directory or file cannot be written.
    """
    if not args.enable_save:
        LOGGER.info("Save disabled, skipping file write")
        return

    LOGGER.info(f"Writing data to {FILE1_COUNT}")

    # License ids are unique dict keys, so sorting the items orders the
    # rows by LICENSE_ID.
    data_rows = [
        {
            "LICENSE_ID": license_id,
            "LICENSE_NAME": FLICKR_LICENSES[license_id],
            "COUNT": count,
        }
        for license_id, count in sorted(license_counts.items())
    ]

    # The output directory may not exist yet (for example on the first run
    # for a new quarter); create it rather than failing in open().
    output_dir = os.path.dirname(FILE1_COUNT)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(FILE1_COUNT, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=HEADER1_COUNT)
        writer.writeheader()
        writer.writerows(data_rows)

    LOGGER.info(f"Successfully wrote {len(data_rows)} records")
161+
162+
163+
def main():
    """Run the fetch: collect per-license counts, save them, report them.

    Steps: parse options, optionally fetch and merge from git, obtain the
    counts (fake data in dev mode, otherwise one Flickr search per id in
    CC_LICENSES), save them, optionally commit and push, then log a
    summary. Any error is logged and the process exits with status 1.
    """
    try:
        args = parse_arguments()

        if args.enable_git:
            shared.git_fetch_and_merge(args, PATHS["repo"])

        if not args.dev:
            flickr = get_flickr_api()
            license_counts = {}
            for license_id in CC_LICENSES:
                license_counts[license_id] = fetch_license_count(
                    flickr, license_id, args.limit
                )
                # Short pause between consecutive API requests
                time.sleep(0.1)
        else:
            license_counts = generate_fake_data(args)

        save_data(args, license_counts)

        if args.enable_git:
            args = shared.git_add_and_commit(
                args,
                PATHS["repo"],
                PATHS["data_quarter"],
                f"Add Flickr data for {QUARTER}",
            )
            shared.git_push_changes(args, PATHS["repo"])

        grand_total = sum(license_counts.values())
        LOGGER.info(f"Done. Total photos across all licenses: {grand_total}")

        # One summary line per license, in ascending id order
        for license_id, count in sorted(license_counts.items()):
            name = FLICKR_LICENSES[license_id]
            LOGGER.info(f" License {license_id} ({name}): {count} photos")

    except shared.QuantifyingException as e:
        LOGGER.error(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        LOGGER.error(f"Unexpected error: {e}")
        # 10 is the numeric value of logging.DEBUG: the highlighted
        # traceback is printed only when debug logging is enabled.
        if LOGGER.isEnabledFor(10):
            print(
                highlight(
                    traceback.format_exc(),
                    PythonTracebackLexer(),
                    TerminalFormatter(),
                )
            )
        sys.exit(1)
215+
216+
217+
if __name__ == "__main__":
218+
main()

0 commit comments

Comments
 (0)