Skip to content

Commit 01f4e01

Browse files
committed
order/sort data
1 parent 631df48 commit 01f4e01

File tree

1 file changed

+43
-28
lines changed

1 file changed

+43
-28
lines changed

scripts/1-fetch/arxiv_fetch.py

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import traceback
1414
import urllib.parse
1515
from collections import Counter, defaultdict
16+
from operator import itemgetter
1617

1718
# Third-party
1819
import feedparser
@@ -455,60 +456,74 @@ def save_count_data(
455456
# author_counts: {license: {author_count(int|None): count}}
456457

457458
# Save license counts
459+
data = []
460+
for lic, c in license_counts.items():
461+
data.append({"TOOL_IDENTIFIER": lic, "COUNT": c})
462+
data.sort(key=itemgetter("TOOL_IDENTIFIER"))
458463
with open(FILE_ARXIV_COUNT, "w", encoding="utf-8", newline="\n") as fh:
459464
writer = csv.DictWriter(fh, fieldnames=HEADER_COUNT, dialect="unix")
460465
writer.writeheader()
461-
for lic, c in license_counts.items():
462-
writer.writerow({"TOOL_IDENTIFIER": lic, "COUNT": c})
466+
for row in data:
467+
writer.writerow(row)
463468

464469
# Save category report with labels
470+
data = []
471+
for lic, cats in category_counts.items():
472+
for code, c in cats.items():
473+
label = CATEGORIES.get(code, code)
474+
data.append(
475+
{
476+
"TOOL_IDENTIFIER": lic,
477+
"CATEGORY_CODE": code,
478+
"CATEGORY_LABEL": label,
479+
"COUNT": c,
480+
}
481+
)
482+
data.sort(key=itemgetter("TOOL_IDENTIFIER", "CATEGORY_CODE"))
465483
with open(
466484
FILE_ARXIV_CATEGORY_REPORT, "w", encoding="utf-8", newline="\n"
467485
) as fh:
468486
writer = csv.DictWriter(
469487
fh, fieldnames=HEADER_CATEGORY_REPORT, dialect="unix"
470488
)
471489
writer.writeheader()
472-
for lic, cats in category_counts.items():
473-
for code, c in cats.items():
474-
label = CATEGORIES.get(code, code)
475-
writer.writerow(
476-
{
477-
"TOOL_IDENTIFIER": lic,
478-
"CATEGORY_CODE": code,
479-
"CATEGORY_LABEL": label,
480-
"COUNT": c,
481-
}
482-
)
490+
for row in data:
491+
writer.writerow(row)
483492

484493
# Save year counts
494+
data = []
495+
for lic, years in year_counts.items():
496+
for year, c in years.items():
497+
data.append({"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": c})
498+
data.sort(key=itemgetter("TOOL_IDENTIFIER", "YEAR"))
485499
with open(FILE_ARXIV_YEAR, "w", encoding="utf-8", newline="\n") as fh:
486500
writer = csv.DictWriter(fh, fieldnames=HEADER_YEAR, dialect="unix")
487501
writer.writeheader()
488-
for lic, years in year_counts.items():
489-
for year, c in years.items():
490-
writer.writerow(
491-
{"TOOL_IDENTIFIER": lic, "YEAR": year, "COUNT": c}
492-
)
502+
for row in data:
503+
writer.writerow(row)
493504

494505
# Save author buckets summary
506+
data = []
507+
for lic, acs in author_counts.items():
508+
# build buckets across licenses
509+
bucket_counts = Counter()
510+
for ac, c in acs.items():
511+
b = bucket_author_count(ac)
512+
bucket_counts[b] += c
513+
for b, c in bucket_counts.items():
514+
data.append(
515+
{"TOOL_IDENTIFIER": lic, "AUTHOR_BUCKET": b, "COUNT": c}
516+
)
517+
data.sort(key=itemgetter("TOOL_IDENTIFIER", "AUTHOR_BUCKET"))
495518
with open(
496519
FILE_ARXIV_AUTHOR_BUCKET, "w", encoding="utf-8", newline="\n"
497520
) as fh:
498521
writer = csv.DictWriter(
499522
fh, fieldnames=HEADER_AUTHOR_BUCKET, dialect="unix"
500523
)
501524
writer.writeheader()
502-
# build buckets across licenses
503-
for lic, acs in author_counts.items():
504-
bucket_counts = Counter()
505-
for ac, c in acs.items():
506-
b = bucket_author_count(ac)
507-
bucket_counts[b] += c
508-
for b, c in bucket_counts.items():
509-
writer.writerow(
510-
{"TOOL_IDENTIFIER": lic, "AUTHOR_BUCKET": b, "COUNT": c}
511-
)
525+
for row in data:
526+
writer.writerow(row)
512527

513528

514529
def query_arxiv(args):

0 commit comments

Comments
 (0)