-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathplot-data.py
More file actions
83 lines (66 loc) · 2.74 KB
/
Copy pathplot-data.py
File metadata and controls
83 lines (66 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
import pandas as pd
import matplotlib.pyplot as plt
import jsonlines
import sys
import argparse
# Usage:
# ./plot-data.py \
# --title "Common Crawl Google Scholar Citations per Year" \
# --groupby "year" \
# --xlabel "Year" \
# --ylabel "N Citations" \
# --output "citations-by-year.png" \
# --format "jsonl" \
# < data.jsonl
# Alternatively for CSV data as input:
# ./plot-data.py \
# --title "Common Crawl Google Scholar Citations per Year" \
# --groupby "year" \
# --xlabel "Year" \
# --ylabel "N Citations" \
# --output "citations-by-year.png" \
# --format "csv" \
# --transparent \
# < data.csv
# Command-line interface: every option has a default, so the script can be
# run with no arguments at all.
parser = argparse.ArgumentParser()

# Plain string options, registered in the order they appear in --help.
for flag, fallback in (
    ('--title', 'Graph Title'),
    ('--xlabel', 'X Label'),
    ('--ylabel', 'Y Label'),
    ('--output', 'output.png'),
    ('--groupby', 'year'),
):
    parser.add_argument(flag, default=fallback)

parser.add_argument(
    '--transparent',
    action='store_true',
    help='Save plot with transparent background',
)
parser.add_argument(
    '--format',
    choices=['jsonl', 'csv'],
    default='jsonl',
    help='input data format',
)

args = parser.parse_args()

# Path the rendered figure is written to.
outputfile = args.output
# Fill colour used for the bars.
ccblue = '#3287c5'

# Build the DataFrame from stdin according to --format. argparse restricts
# the value to 'jsonl' or 'csv', so exactly one branch runs.
if args.format == 'csv':
    df = pd.read_csv(sys.stdin)
elif args.format == 'jsonl':
    # Each input line is one JSON object; collect them all, then let pandas
    # turn the list of records into columns.
    with jsonlines.Reader(sys.stdin) as reader:
        data = list(reader)
    df = pd.DataFrame(data)

# Debugging aid: uncomment to inspect the parsed frame.
# print("DataFrame columns:", df.columns)
# print("DataFrame head:\n", df.head())
# Render a bar chart of the 'count' column against the column named by
# --groupby, label each bar with its value, and save the figure to outputfile.

# Validate up front so a wrong --groupby (or empty input) produces a readable
# message instead of a pandas KeyError traceback.
missing = [col for col in (args.groupby, 'count') if col not in df.columns]
if missing:
    sys.exit(f"Input data is missing required column(s): {', '.join(missing)}")

plt.figure(figsize=(20, 12))

# One bar per input row. The x values are converted to strings so they are
# used as labels; the column is selected by --groupby (default 'year').
bars = plt.bar(df[args.groupby].astype(str), df['count'], color=ccblue)

# Write each bar's integer height just above the bar, centred horizontally.
for bar in bars:
    yval = bar.get_height()
    plt.text(
        bar.get_x() + bar.get_width() / 2.0,
        yval,
        str(int(yval)),
        va='bottom',
        ha='center',
    )

plt.title(args.title)
plt.xlabel(args.xlabel)
plt.ylabel(args.ylabel)
plt.grid(False)

# Strip the chart down: no axes background, no spines, no tick marks.
ax = plt.gca()
ax.patch.set_alpha(0)
# The figure background is only made see-through when --transparent is given.
plt.gcf().patch.set_alpha(0 if args.transparent else 1)
for spine in ax.spines.values():
    spine.set_visible(False)
ax.tick_params(left=False, bottom=False)

plt.savefig(outputfile, transparent=args.transparent, dpi=320, bbox_inches='tight')
print(f"Total rows: {len(df)}")
print(f"Plot saved to {outputfile}")