Skip to content

Commit 53ac8f7

Browse files
committed
Create deduplicate.py
1 parent 3e0ea21 commit 53ac8f7

File tree

1 file changed

+99
-0
lines changed

1 file changed

+99
-0
lines changed

scripts/deduplicate.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# Replaces duplicate files with symlinks
2+
import sys
3+
import traceback
4+
import os
5+
from pathlib import Path
6+
import hashlib
7+
import json
8+
from shutil import copyfile
9+
10+
# MD5 digests that main() treats specially: when a file with one of these
# digests becomes the canonical copy, its name gets an "-e" suffix
# (80x15.png --> 80x15-e.png).
# NOTE(review): presumably these are the euro variants of the icons - confirm.
EURO = [
    "b9f4175382a404007e19d3566061e36c",
    "9076ddd6ddf0bffc24e6ac71c1353d33",
    "1d7fb4e154e7198dfb39d16d9800844d",
]
15+
16+
def count(path):
    """Summarise the regular files and symbolic links found below *path*.

    The tree is traversed with ``os.walk`` and every regular file is hashed
    with MD5. Symbolic links that ``os.walk`` reports among the files are
    counted but never hashed.

    Args:
        path: Directory to scan.

    Returns:
        A dict with the keys ``"files"`` (number of regular files),
        ``"links"`` (number of symbolic links), ``"unique"`` (number of
        distinct MD5 digests) and ``"hashes"`` (sorted list of those
        distinct digests).
    """
    files_seen = 0
    links_seen = 0
    digests = set()
    for root, _dirs, files in os.walk(path):
        for name in files:
            full_path = os.path.join(root, name)
            # Test for links first: os.path.isfile() follows symlinks, so a
            # link to a regular file would otherwise be counted as a file.
            if os.path.islink(full_path):
                links_seen += 1
            elif os.path.isfile(full_path):
                files_seen += 1
                md5 = hashlib.md5()
                # Hash in chunks and close the handle deterministically.
                with open(full_path, "rb") as handle:
                    for chunk in iter(lambda: handle.read(65536), b""):
                        md5.update(chunk)
                digests.add(md5.hexdigest())
    return {
        "files": files_seen,
        "links": links_seen,
        "unique": len(digests),
        # Sorted so that repeated runs print the digests in the same order.
        "hashes": sorted(digests),
    }
34+
35+
36+
def main():
    """Replace duplicate files under ``www/l`` with symbolic links.

    The ``www/l`` directory is located relative to the parent of the current
    working directory. Files are grouped by MD5 digest. For each group one
    copy is placed at a "base" path (``<first dir>/<file name>`` below
    ``www/l``, with an ``-e`` suffix for digests listed in ``EURO``), and
    every other file in the group is deleted and replaced by a symbolic link
    to that base.

    A group is left untouched, with a warning on stderr, when its base path
    already holds a file with different content; copying over it would
    destroy that file.

    Statistics from ``count()`` are printed before and after, and the
    digest -> {"base", "symlinks"} mapping is written to ``output.json`` in
    the current working directory.
    """
    # Gets the full path of the www/l directory.
    path = Path().resolve().parent.joinpath("www", "l")

    # Scan once and reuse the result instead of hashing the tree twice.
    before = count(str(path))
    hashes = set(before["hashes"])
    print(before)

    # Stores the original files:
    # digest -> {"base": canonical path, "symlinks": [paths with that digest]}
    original_files = {}
    for root, _dirs, files in os.walk(str(path)):
        for file in files:
            f = Path(root).joinpath(file)
            digest = _file_md5(str(f))

            # Digest already seen: just record this path as another duplicate.
            if digest in original_files:
                original_files[digest]["symlinks"].append(str(f))
                continue

            # Gets the relative path of the file,
            # www/l/by-nc/1.0/80x15.png --> by-nc/1.0/80x15.png
            parts = f.relative_to(path).parts

            if len(parts) > 1:
                # Joins the first and last part of the file
                # for moving the file to the parent license folder
                # because some files are under 2 sub-directories.
                # by-nc/1.0/80x15.png --> by-nc + 80x15.png
                parent = path.joinpath(parts[0], parts[-1])
            else:
                # A file directly under www/l has no license folder; using
                # parts[0] and parts[-1] would yield <file>/<file>, so the
                # file itself serves as the base.
                parent = f

            # Only rewrite the file name; a ".png" elsewhere in the path
            # (e.g. in a directory name) must not be touched.
            if digest in EURO and parent.suffix == ".png":
                parent = parent.with_name(parent.stem + "-e.png")

            original_files[digest] = {
                "base": str(parent),
                "symlinks": [
                    str(f)
                ]
            }

    # NOTE(review): "base" is an absolute path, so the links created below
    # are absolute as well - confirm that this is intended.
    for digest, value in original_files.items():
        base = value["base"]

        # Two different digests can map to the same base path. Never
        # overwrite a base that already holds other content.
        if os.path.exists(base) and _file_md5(base) != digest:
            print(
                "WARNING Skipping {}: {} already exists with different "
                "content.".format(digest, base),
                file=sys.stderr,
            )
            continue

        # Copy one of the duplicated files to the parent folder
        # for creating symbolic link.
        if value["symlinks"][0] != base:
            copyfile(value["symlinks"][0], base)

        # Delete all duplicated files and replace them with
        # symbolic link.
        for link in value["symlinks"]:
            if link != base:
                os.remove(link)
                os.symlink(base, link)

    after = count(str(path))
    # Digests that were present before the run but are gone afterwards.
    print(list(hashes - set(after["hashes"])))
    print(after)

    with open("output.json", "w") as output:
        output.write(json.dumps(original_files, indent=4))


def _file_md5(file_path):
    """Return the hex MD5 digest of the file at *file_path*."""
    md5 = hashlib.md5()
    with open(file_path, "rb") as handle:
        for chunk in iter(lambda: handle.read(65536), b""):
            md5.update(chunk)
    return md5.hexdigest()
86+
87+
88+
# Script entry point: run main() and map the outcome to a process exit code.
if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        # Propagate an explicit exit request with its original code.
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # print_exc() writes the traceback to stderr itself and returns None;
        # wrapping it in print() would emit a stray "None" line.
        traceback.print_exc()
        sys.exit(1)

0 commit comments

Comments
 (0)