#!/usr/bin/env python3
# vim: set fileencoding=utf-8:

"""Normalize file and add/update the language list at the bottom of all CC4
legalcode files.
"""

# Copyright 2016, 2017 Creative Commons
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Standard library
from collections import OrderedDict
import argparse
import difflib
import glob
import os.path
import re
import sys
import traceback

# Local/library specific
import lang_tag_to


# Insertion points for the marker comments, in the order they are processed.
# Each entry has a human readable "label", a "regex" whose "target" group is
# read by insert_missing_comment(), and optionally the "include" file whose
# text update_include() places between the *_start and *_end markers.
#
# NOTE(review): the HTML fragments inside these patterns (and inside the
# marker, selector and link strings further down) appear to have been
# stripped from this copy of the file: the named groups read "(?P " and the
# marker f-strings are empty. They are preserved exactly as found and must be
# restored from version control before this tool can run. The
# "language_selector_start" entry that update_lang_selector() looks up is
# also not present here.
COMMENTS = OrderedDict(
    {
        "head_start": {
            "label": "Head Start",
            "regex": re.compile(
                r"""(?P /errata[.]js['"]>\s* )""",
                re.IGNORECASE | re.VERBOSE,
            ),
            "include": "html-head.html",
        },
        "head_end": {
            "label": "Head End",
            "regex": re.compile(
                r"""(?P \s* )""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
        "site_header_start": {
            "label": "Site Header Start",
            "regex": re.compile(
                r"""(?P ]+>\s )""",
                re.IGNORECASE | re.VERBOSE,
            ),
            "include": "site-header.html",
        },
        "site_header_end": {
            "label": "Site Header End",
            "regex": re.compile(
                r"""(?P \s )""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
        "language_selector_end": {
            "label": "Language Selector End",
            "regex": re.compile(
                r"""(?P \s )""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
        "legalcode_end": {
            "label": "Legalcode End",
            "regex": re.compile(
                r"""(?P \s*.*

)""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
        "language_footer_start": {
            "label": "Language Footer Start",
            "regex": re.compile(
                r"""(?P [^<]+\s )""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
        "language_footer_end": {
            "label": "Language Footer End",
            "regex": re.compile(
                r"""(?P
                    (?<=
                        # \u3002 is ideographic full stop
                        [.\u3002]
                    )
                    \s*[^<]+
                )""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
        "site_footer_start": {
            "label": "Site Footer Start",
            "regex": re.compile(
                r"""(?P \s* <]+<]+

\s*\s* \s* )""",
                re.IGNORECASE | re.VERBOSE,
            ),
            "include": "site-footer.html",
        },
        "site_footer_end": {
            "label": "Site Footer End",
            "regex": re.compile(
                r"""(?P \s* )""",
                re.IGNORECASE | re.VERBOSE,
            ),
        },
    }
)
FAQ_TRANSLATION_LINK = "/faq/#officialtranslations"


class ToolError(Exception):
    """Error carrying the process exit code that the script should use.

    The code (default 1) is stored on ``self.code`` and prefixed to the
    message as "(code) message".
    """

    def __init__(self, message, code=None):
        self.code = code or 1
        super().__init__(f"({self.code}) {message}")


def diff_changes(filename, old, new):
    """Display changes as a colorized unified diff.

    Nothing is printed when old and new do not differ.
    """
    diff = list(
        difflib.unified_diff(
            old.split("\n"),
            new.split("\n"),
            fromfile=f"{filename}: current",
            tofile=f"{filename}: proposed",
            n=3,
        )
    )
    if not diff:
        return
    # Color diff output
    rst = "\033[0m"
    for i, line in enumerate(diff):
        # The "---"/"+++" header tests must precede the plain "-"/"+" tests.
        if line.startswith("---"):
            diff[i] = f"\033[91m{line.rstrip()}{rst}"
        elif line.startswith("+++"):
            diff[i] = f"\033[92m{line.rstrip()}{rst}"
        elif line.startswith("@"):
            diff[i] = f"\033[36m{line.rstrip()}{rst}"
        elif line.startswith("-"):
            diff[i] = f"\033[31m{line}{rst}"
        elif line.startswith("+"):
            diff[i] = f"\033[32m{line}{rst}"
        else:
            diff[i] = f"\033[90m{line}{rst}"
    print("\n".join(diff))


def _replace_marked_section(args, filename, content, start, end, body, what):
    """Replace everything from the start marker to the end marker.

    The new block is ``start``, ``body`` and ``end`` joined by newlines.
    ``what`` names the section in the progress messages. Returns the updated
    content (or the unchanged content when the block is already current).
    Exits with status 1 when the markers cannot be found.
    """
    # The markers are literal text, so escape them before building the regex.
    match = re.search(
        f"{re.escape(start)}.*{re.escape(end)}", content, re.DOTALL
    )
    if match is None:
        print(
            f"{filename}: ERROR: {what} markers not matched. Aborting"
            " processing"
        )
        sys.exit(1)
    target = match.group()
    replacement = f"{start}\n{body}\n{end}"
    if target == replacement:
        print(f"{filename}: Skipping unneeded update of {what}")
        return content
    print(f"{filename}: Updating {what}")
    new_content = content.replace(target, replacement, 1)
    if args.debug:
        diff_changes(filename, content, new_content)
    return new_content


def update_include(args, filename, content, section):
    """Replace the text between the section's start and end markers with the
    contents of the section's include file.

    The include file name comes from COMMENTS[f"{section}_start"]["include"]
    and is read from the "legalcode-includes" directory under sys.path[0].
    Returns the updated content.
    """
    # NOTE(review): the marker strings below are empty and leave label
    # unused; the marker markup appears to have been stripped (see COMMENTS).
    label = COMMENTS[f"{section}_start"]["label"]
    start = f""
    label = COMMENTS[f"{section}_end"]["label"]
    end = f""
    include_file = COMMENTS[f"{section}_start"]["include"]
    include = os.path.join(sys.path[0], "legalcode-includes", include_file)
    with open(include, "r", encoding="utf-8") as file_in:
        includetext = file_in.read()
    return _replace_marked_section(
        args, filename, content, start, end, includetext, f"{section} include"
    )


def update_lang_selector(args, filename, content, lang_tags):
    """Replace the contents of the language selector (everything between
    language_selector_start and language_selector_end HTML comments) with a
    list of links based on the legalcode files currently being processed.
    """
    current_language = lang_tags_from_filenames(filename)[0]
    # NOTE(review): only the SELECT_TEXT lookup survives here; the
    # surrounding selector markup (and whatever consumed lang_tags) appears
    # to have been stripped. The remaining literals are kept as found.
    selector = (
        "\n\n"
        "\n\n\n"
        f"\n{lang_tag_to.SELECT_TEXT[current_language]}"
        "\n\n\n"
        "\n\n\n\n"
    )
    # Update the language selector block to the content
    # NOTE(review): empty marker strings, see COMMENTS.
    label = COMMENTS["language_selector_start"]["label"]
    start = f""
    label = COMMENTS["language_selector_end"]["label"]
    end = f""
    return _replace_marked_section(
        args,
        filename,
        content,
        start,
        end,
        selector,
        "language selector options",
    )


def update_lang_footer(args, filename, content, lang_tags):
    """Replace the contents of the language footer (everything between
    language_footer_start and language_footer_end HTML comments) with a list
    of links based on the legalcode files currently being processed.

    The current file's own language is left out of the list, and the list
    ends with a language-appropriate full stop.
    """
    current_language = lang_tags_from_filenames(filename)[0]
    # NOTE(review): license_type and index are not used by the link string
    # below; the link markup that presumably consumed them appears to have
    # been stripped (see COMMENTS) -- confirm against version control.
    license_type = filename.split("_")[0]
    links = []
    for lang_tag in lang_tags:
        if lang_tag == current_language:
            continue
        # Determine to option value for the language. English breaks the
        # pattern so handle it differently.
        if lang_tag == "en":
            index = "legalcode"
        else:
            index = f"legalcode.{lang_tag}"
        links.append(f"" f"{lang_tag_to.LABEL[lang_tag]},\n")
    # Drop the separator that trails the last link
    footer = "".join(links).rstrip(",\n")
    # Update the language footer block to the content
    # NOTE(review): empty marker strings, see COMMENTS.
    label = COMMENTS["language_footer_start"]["label"]
    start = f""
    label = COMMENTS["language_footer_end"]["label"]
    end = f""
    if current_language in ["ja", "zh-Hans", "zh-Hant"]:
        # Use ideographic full stop ("。")
        period = "\u3002"
    else:
        # Use ASCII period
        period = "."
    return _replace_marked_section(
        args,
        filename,
        content,
        start,
        end,
        f"{footer}{period}",
        "language footer links",
    )


def insert_missing_comment(args, filename, content, comment_dict):
    """Insert the comment in the appropriate locations, if it is not already
    present.

    comment_dict is one COMMENTS entry. Start comments go after the text
    matched by the entry's "target" group, end comments go before it. Exits
    with status 1 when the insertion point is not matched.
    """
    label = comment_dict["label"]
    # NOTE(review): empty comment string, see COMMENTS.
    comment = f""
    regex = comment_dict["regex"]
    if comment in content:
        print(
            f"{filename}: Skipping unneeded {label} HTML comment insertion"
        )
        return content
    print(f"{filename}: inserting {label} HTML comment")
    matches = regex.search(content)
    if matches is None:
        print(
            f"{filename}: ERROR: {label} insertion point not matched. Aborting"
            " processing"
        )
        sys.exit(1)
    target = matches.group("target")
    if " start" in label.lower():
        # Start comments are inserted after target regex
        target_new = target.rstrip()
        replacement = f"{target_new}\n{comment}\n"
    else:
        # End comments are inserted before target regex
        target_new = target.lstrip("\n")
        replacement = f"\n{comment}\n{target_new}"
    new_content = content.replace(target, replacement, 1)
    if args.debug:
        diff_changes(filename, content, new_content)
    return new_content


def has_correct_faq_officialtranslations(content):
    """Determine if the link to the translation FAQ is correct.
    """
    return f'"{FAQ_TRANSLATION_LINK}"' in content


def normalize_faq_translation_link(args, filename, content):
    """Replace various incorrect translation FAQ links with the correct link
    (FAQ_TRANSLATION_LINK).

    Exits with status 1 when no translation FAQ link is matched.
    """
    if has_correct_faq_officialtranslations(content):
        print(
            f"{filename}: Skipping unneeded translation FAQ link"
            " normalization"
        )
        return content
    print(f"{filename}: normalizing translation FAQ link")
    # NOTE(review): the group names appear to have been stripped from this
    # pattern (see COMMENTS); group("target") is read below.
    re_pattern = re.compile(
        r"""
        (?P
            href=['"]
        )
        (?P
            # Matches various translation FAQ URLs
            [^'"]*/[Ff][Aa][Qq]/?[#][^'"]*
        )
        (?P
            ['"]
        )
        """,
        re.DOTALL | re.MULTILINE | re.VERBOSE,
    )
    matches = re_pattern.search(content)
    if matches is None:
        print(
            f"{filename}: ERROR: translation link not matched. Aborting"
            " processing"
        )
        sys.exit(1)
    target = matches.group("target")
    replacement = FAQ_TRANSLATION_LINK
    new_content = content.replace(target, replacement, 1)
    if args.debug:
        diff_changes(filename, content, new_content)
    return new_content


def has_correct_languages_anchor(content):
    """Determine if language anchor uses id
    """
    return 'id="languages"' in content


def normalize_languages_anchor(args, filename, content):
    """Replace name with id in languages anchor (HTML5 compatibility)

    Exits with status 1 when no name="languages" attribute is matched.
    """
    if has_correct_languages_anchor(content):
        print(
            f"{filename}: Skipping unneeded language anchor normalization"
        )
        return content
    print(f"{filename}: normalizing language anchor id")
    re_pattern = re.compile("name=['\"]languages['\"]", re.IGNORECASE)
    matches = re_pattern.search(content)
    if matches is None:
        print(
            f"{filename}: ERROR: languages anchor not matched. Aborting"
            " processing"
        )
        sys.exit(1)
    target = matches.group()
    replacement = 'id="languages"'
    new_content = content.replace(target, replacement, 1)
    if args.debug:
        diff_changes(filename, content, new_content)
    return new_content


def normalize_line_endings(args, filename, content):
    """Normalize line endings to unix LF (\\n)

    Both windows (CRLF) and mac (lone CR) newlines are converted. Returns the
    content unchanged when it contains neither.
    """
    crlf_count = content.count("\r\n")
    # Every CR that is not the first half of a CRLF pair is a mac newline
    cr_count = content.count("\r") - crlf_count
    message = ""
    if cr_count:
        message = f" {cr_count} mac newlines (CR)"
    if crlf_count:
        if message:
            message = f"{message} and"
        message = f"{message} {crlf_count} windows newlines (CRLF)"
    if not message:
        print(f"{filename}: Skipping unneeded newline conversion")
        return content
    print(f"{filename}: Converting{message} to unix newlines (LF)")
    # CRLF must be replaced first so its CR is not converted a second time
    return content.replace("\r\n", "\n").replace("\r", "\n")


def process_file_contents(args, file_list, lang_tags):
    """Process each of the CC4 legalcode files and update them, as necessary.

    Each file is read, run through every normalization and update step, and
    written back only if it changed and debug mode is off.
    """
    for filename in file_list:
        # newline="" keeps the original line endings so they can be counted
        with open(filename, "r", encoding="utf-8", newline="") as file_in:
            content = file_in.read()
        new_content = content
        new_content = normalize_line_endings(args, filename, new_content)
        new_content = normalize_languages_anchor(args, filename, new_content)
        new_content = normalize_faq_translation_link(
            args, filename, new_content
        )
        for comment_dict in COMMENTS.values():
            new_content = insert_missing_comment(
                args, filename, new_content, comment_dict
            )
        new_content = update_lang_selector(
            args, filename, new_content, lang_tags
        )
        new_content = update_lang_footer(
            args, filename, new_content, lang_tags
        )
        for section in ("head", "site_header", "site_footer"):
            new_content = update_include(args, filename, new_content, section)
        if content == new_content:
            print(
                f"{filename}: Skipping writing back to file (no changes)"
            )
        elif args.debug:
            print(f"{filename}: DEBUG: Skipping writing changes to file")
        else:
            print(f"{filename}: Writing changes to file")
            # newline="\n" stops the platform from translating the
            # normalized LF line endings on write
            with open(
                filename, "w", encoding="utf-8", newline="\n"
            ) as file_out:
                file_out.write(new_content)
        print()
        print()


def lang_tags_from_filenames(file_list):
    """Extract RFC 5646 language tags from filename(s)

    file_list may be a single filename or an iterable of filenames. The tag
    is taken from the second dot-separated part of the base name, minus its
    first two characters; an empty tag means English ("en"). Returns a sorted
    list of unique tags.
    """
    if isinstance(file_list, str):
        file_list = [file_list]
    # Only the base name is parsed so that directory components (for example
    # a leading "./") cannot shift the dot-separated fields.
    lang_tags = {
        os.path.basename(filename).split(".")[1][2:] or "en"
        for filename in file_list
    }
    return sorted(lang_tags)


def setup():
    """Instantiate and configure argparse and logging.

    Return argsparse namespace.
    """
    default_glob = ["by*4.0*.html"]
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument(
        "-d",
        "--debug",
        action="store_true",
        help="Debug mode: list changes without modification",
    )
    ap.add_argument(
        "globs",
        nargs="*",
        default=default_glob,
        help=(
            "Filename or shell glob of the file(s) that will be updated"
            f' (default: "{default_glob[0]}")'
        ),
        metavar="FILENAME",
    )
    args = ap.parse_args()
    return args


def main():
    """Expand the filename globs and process every matching regular file.

    Symbolic links are skipped.
    """
    args = setup()
    file_list = sorted(
        {
            filename
            for fileglob in args.globs
            for filename in glob.glob(fileglob)
            if os.path.isfile(filename) and not os.path.islink(filename)
        }
    )
    lang_tags = lang_tags_from_filenames(file_list)
    process_file_contents(args, file_list, lang_tags)


if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except ToolError as error:
        print(f"CRITICAL {error}", file=sys.stderr)
        sys.exit(error.code)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print()
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)