1+ # Replaces duplicate files with symlinks
2+ import sys
3+ import traceback
4+ import os
5+ from pathlib import Path
6+ import hashlib
7+ import json
8+ from shutil import copyfile
9+
# MD5 hex digests that main() treats specially: when a file's digest is listed
# here, "-e" is inserted before ".png" in the name of its shared copy.
# NOTE(review): presumably these are the Euro-symbol variants of the images;
# confirm against the image set.
EURO = [
    "b9f4175382a404007e19d3566061e36c",
    "9076ddd6ddf0bffc24e6ac71c1353d33",
    "1d7fb4e154e7198dfb39d16d9800844d",
]
15+
def count(path):
    """Summarise the files found under *path*.

    Walks the tree rooted at ``path`` with ``os.walk`` and classifies every
    entry that ``os.walk`` lists as a file: symbolic links are only counted,
    regular files are counted and hashed with MD5.

    Args:
        path: Directory to scan (anything ``os.walk`` accepts).

    Returns:
        A dict with four keys, in this order:
            "files":  number of regular (non-symlink) files,
            "links":  number of symbolic links,
            "unique": number of distinct MD5 digests among the regular files,
            "hashes": sorted list of those distinct hex digests.
    """
    regular_total = 0
    link_total = 0
    digests = set()
    for root, _dirs, names in os.walk(path):
        for name in names:
            full_path = os.path.join(root, name)
            # islink() must be tested first: isfile() follows symlinks and
            # would classify a link to a file as a regular file.
            if os.path.islink(full_path):
                link_total += 1
            elif os.path.isfile(full_path):
                regular_total += 1
                md5 = hashlib.md5()
                # Read in chunks inside a context manager so the handle is
                # always closed and large files are not loaded whole.
                with open(full_path, "rb") as handle:
                    for chunk in iter(lambda: handle.read(65536), b""):
                        md5.update(chunk)
                digests.add(md5.hexdigest())
    return {
        "files": regular_total,
        "links": link_total,
        "unique": len(digests),
        # Sorted so that repeated runs print the same output.
        "hashes": sorted(digests),
    }
34+
35+
def main():
    """Replace duplicate files under ``../www/l`` with symbolic links.

    The directory is resolved relative to the parent of the current working
    directory. Regular files are grouped by MD5 digest. For each group a
    shared copy (the "base") is placed directly inside the top-level
    sub-directory of the first file seen, and every other file of the group
    is removed and replaced by a symlink to that base.

    Before and after the rewrite the tree summary from ``count`` is printed,
    together with the list of digests that disappeared (expected: empty).
    The digest -> {"base", "symlinks"} mapping is written to ``output.json``
    in the current working directory.

    Raises:
        RuntimeError: if the planned base of a group is already occupied by
            different content, or is claimed by two groups. Raised before
            anything on disk is modified.
    """
    # Gets the full path of the www/l directory.
    path = Path().resolve().parent.joinpath("www", "l")

    # One scan is enough for both the printed summary and the hash set.
    before = count(str(path))
    hashes = set(before["hashes"])
    print(before)

    # digest -> {"base": <shared copy>, "symlinks": [<files with that digest>]}
    original_files = {}
    # path -> digest for every regular file scanned, used for conflict checks.
    digest_by_path = {}
    for root, _dirs, files in os.walk(path):
        for file in files:
            f = Path(root).joinpath(file)
            # Same classification as count(): links left by an earlier run
            # (or dangling links) must not be hashed and re-linked.
            if f.is_symlink() or not f.is_file():
                continue

            md5 = hashlib.md5()
            with open(str(f), "rb") as handle:
                for chunk in iter(lambda: handle.read(65536), b""):
                    md5.update(chunk)
            digest = md5.hexdigest()
            digest_by_path[str(f)] = digest

            # Check if the MD5 is already known.
            if digest in original_files:
                original_files[digest]["symlinks"].append(str(f))
                continue

            # Gets the relative path of the file,
            # www/l/by-nc/1.0/80x15.png --> by-nc/1.0/80x15.png
            parts = f.relative_to(path).parts

            # Joins the first and last part of the file
            # for moving the file to the parent license folder
            # because some files are under 2 sub-directories.
            # by-nc/1.0/80x15.png --> by-nc + 80x15.png
            # A file directly under the root has no license folder; it is
            # its own base (joining would give root/name/name).
            if len(parts) > 1:
                parent = path.joinpath(parts[0], parts[-1])
            else:
                parent = f

            # Rename only the file name (never directory components), and
            # only once, so a second run does not produce "-e-e.png".
            if digest in EURO and "-e.png" not in parent.name:
                parent = parent.with_name(
                    parent.name.replace(".png", "-e.png")
                )

            original_files[digest] = {
                "base": str(parent),
                "symlinks": [str(f)],
            }

    # Refuse to continue if creating a base would destroy other content.
    claimed = {}
    conflicts = []
    for digest, value in original_files.items():
        base = value["base"]
        if claimed.setdefault(base, digest) != digest:
            # Two different contents want the same shared copy.
            conflicts.append(base)
        elif base in digest_by_path:
            # A regular file already lives there; fine only if identical.
            if digest_by_path[base] != digest:
                conflicts.append(base)
        elif os.path.lexists(base):
            # Occupied by something that was not scanned (symlink, dir, ...).
            conflicts.append(base)
    if conflicts:
        raise RuntimeError(
            "Refusing to overwrite existing content at:\n"
            + "\n".join(sorted(set(conflicts)))
        )

    for value in original_files.values():
        base = value["base"]
        first = value["symlinks"][0]
        # Copy one of the duplicated files to the parent folder
        # for creating symbolic link.
        if first != base:
            copyfile(first, base)
        # Delete all duplicated files and replace them with
        # symbolic link.
        for link in value["symlinks"]:
            if link != base:
                os.remove(link)
                os.symlink(base, link)

    after = count(str(path))
    # Digests present before but missing now, i.e. content that was lost.
    print(list(hashes - set(after["hashes"])))
    print(after)
    with open("output.json", "w") as report:
        report.write(json.dumps(original_files, indent=4))
86+
87+
# Script entry point: run main() and map failures to process exit codes.
if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        # Propagate an explicit exit request with its original code.
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # print_exc() writes the traceback to stderr itself and returns
        # None, so it must not be wrapped in print().
        traceback.print_exc()
        sys.exit(1)
0 commit comments