#!/usr/bin/env python3
# vim: set fileencoding=utf-8:
"""Normalize file and add/update the language list at the bottom of all CC4
legalcode files.
"""
# Copyright 2016, 2017 Creative Commons
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
# Standard library
from collections import OrderedDict
import argparse
import difflib
import glob
import os.path
import re
import sys
import traceback
# Local/library specific
import lang_tag_to
# Registry of the HTML marker comments this tool manages, in the order they
# are processed (process_file_contents iterates COMMENTS.keys()). Each entry
# is a dict with:
#   "label"   - marker name, read by insert_missing_comment and the update_*
#               functions when they build the marker text
#   "regex"   - compiled pattern; insert_missing_comment reads its "target"
#               group to locate the insertion point
#   "include" - (head_start, site_header_start and site_footer_start only)
#               file name that update_include opens under "legalcode-includes"
# NOTE(review): every pattern below opens with a bare "(?P" and has little or
# no body, so the group names and the HTML being matched appear to have been
# lost from this copy -- confirm against version control before relying on
# these patterns.
# NOTE(review): update_lang_selector looks up
# COMMENTS["language_selector_start"], but no such key is defined here --
# confirm whether that entry was lost as well.
COMMENTS = OrderedDict(
{
"head_start": {
"label": "Head Start",
"regex": re.compile(
r"""(?P
/errata[.]js['"]>\s*
)""",
re.IGNORECASE | re.VERBOSE,
),
"include": "html-head.html",
},
"head_end": {
"label": "Head End",
"regex": re.compile(
r"""(?P
\s*
)""",
re.IGNORECASE | re.VERBOSE,
),
},
"site_header_start": {
"label": "Site Header Start",
"regex": re.compile(
r"""(?P
]+>\s
)""",
re.IGNORECASE | re.VERBOSE,
),
"include": "site-header.html",
},
"site_header_end": {
"label": "Site Header End",
"regex": re.compile(
r"""(?P
\s
)""",
re.IGNORECASE | re.VERBOSE,
),
},
"language_selector_end": {
"label": "Language Selector End",
"regex": re.compile(
r"""(?P
\s
)""",
re.IGNORECASE | re.VERBOSE,
),
},
"legalcode_end": {
"label": "Legalcode End",
"regex": re.compile(
r"""(?P
\s*.*
)""",
re.IGNORECASE | re.VERBOSE,
),
},
"language_footer_start": {
"label": "Language Footer Start",
"regex": re.compile(
r"""(?P
[^<]+\s
)""",
re.IGNORECASE | re.VERBOSE,
),
},
"language_footer_end": {
"label": "Language Footer End",
"regex": re.compile(
r"""(?P
(?<=
# \u3002 is ideographic full stop
[.\u3002]
)
\s*[^<]+
)""",
re.IGNORECASE | re.VERBOSE,
),
},
"site_footer_start": {
"label": "Site Footer Start",
"regex": re.compile(
r"""(?P
\s*
\s*
)""",
re.IGNORECASE | re.VERBOSE,
),
"include": "site-footer.html",
},
"site_footer_end": {
"label": "Site Footer End",
"regex": re.compile(
r"""(?P
\s*
)""",
re.IGNORECASE | re.VERBOSE,
),
},
}
)
# Correct href for the official-translations FAQ entry;
# normalize_faq_translation_link rewrites a non-conforming FAQ link to this
# value and has_correct_faq_officialtranslations checks for it (in quotes).
FAQ_TRANSLATION_LINK = "/faq/#officialtranslations"
class ToolError(Exception):
    """Fatal tool failure that carries a process exit status.

    The status is stored on ``code``; any falsy value (including the default
    ``None``) becomes 1. The exception text is ``(code) message``.
    """

    def __init__(self, message, code=None):
        # Fall back to the generic failure status when no usable code is given
        self.code = code or 1
        super().__init__(f"({self.code}) {message}")
def diff_changes(filename, old, new):
    """Display changes as a colorized unified diff.

    filename -- name shown in the "current" / "proposed" diff headers
    old -- original text
    new -- proposed text

    Prints nothing when the two texts are identical. Returns None.
    """
    diff = list(
        difflib.unified_diff(
            old.split("\n"),
            new.split("\n"),
            fromfile=f"{filename}: current",
            tofile=f"{filename}: proposed",
            n=3,
        )
    )
    if not diff:
        return
    # Color diff output
    rst = "\033[0m"
    colored = []
    for i, line in enumerate(diff):
        # unified_diff always emits the two file headers first. Identify them
        # by position rather than by prefix so that a removed line whose text
        # begins with "--" (rendered "---...") or an added line beginning with
        # "++" is not mistaken for a header.
        if i == 0:
            colored.append(f"\033[91m{line.rstrip()}{rst}")
        elif i == 1:
            colored.append(f"\033[92m{line.rstrip()}{rst}")
        elif line.startswith("@"):
            # Hunk header; it carries the line terminator added by difflib
            colored.append(f"\033[36m{line.rstrip()}{rst}")
        elif line.startswith("-"):
            colored.append(f"\033[31m{line}{rst}")
        elif line.startswith("+"):
            colored.append(f"\033[32m{line}{rst}")
        else:
            colored.append(f"\033[90m{line}{rst}")
    print("\n".join(colored))
def update_include(args, filename, content, section):
"""Replace the span running from a section's start marker to its end marker
with the contents of that section's include file.

args -- parsed options; only args.debug is read (it triggers a diff)
filename -- name used in the log messages and the diff header
content -- text to update
section -- COMMENTS key prefix; the "{section}_start" and "{section}_end"
entries are read

Returns new_content.
"""
# NOTE(review): start and end are built from empty f-strings, so label is
# assigned but never used; the marker text appears to be missing from this
# copy -- confirm the intended marker format.
label = COMMENTS[f"{section}_start"]["label"]
start = f""
label = COMMENTS[f"{section}_end"]["label"]
end = f""
include_file = COMMENTS[f"{section}_start"]["include"]
# Include files are looked up in "legalcode-includes" under sys.path[0]
include = os.path.join(sys.path[0], "legalcode-includes", include_file)
with open(include, "r", encoding="utf-8") as file_in:
includetext = file_in.read()
# DOTALL lets ".*" span newlines. re.search() returns None when nothing
# matches, in which case .group() raises AttributeError.
target = re.search(f"{start}.*{end}", content, re.DOTALL).group()
replacement = f"{start}\n{includetext}\n{end}"
# NOTE(review): indentation is lost in this copy, so it is unclear whether
# the new_content assignment below belongs to the else branch -- confirm.
if target == replacement:
print(f"{filename}: Skipping unneeded update of {section} include")
else:
print(f"{filename}: Updating {section} include")
new_content = content.replace(target, replacement, 1)
if args.debug:
diff_changes(filename, content, new_content)
# str.replace() always returns a str, so this guard cannot trigger
if new_content is None:
sys.exit(1)
return new_content
def update_lang_selector(args, filename, content, lang_tags):
"""Replace the contents of the language selector (everything between
language_selector_start and language_selector_end HTML comments) with a
list of links based on the legalcode files currently being processed.

args -- parsed options; only args.debug is read (it triggers a diff)
filename -- file being processed; its language tag marks the selected option
content -- text to update
lang_tags -- language tags to list in the selector

Returns new_content.
"""
current_language = lang_tags_from_filenames(filename)[0]
# NOTE(review): the string literals that make up the selector markup are
# empty or broken across lines in this copy; the HTML they contained appears
# to have been lost -- restore from version control.
selector = (
''
'\n
'
f"\n{lang_tag_to.SELECT_TEXT[current_language]}"
'\n
'
"\n
"
)
for lang_tag in lang_tags:
selected = ""
if lang_tag == current_language:
selected = ' selected="selected"'
# Determine the option value for the language. English breaks the
# pattern so handle it differently.
if lang_tag == "en":
index = "legalcode"
else:
index = f"legalcode.{lang_tag}"
# Add the selector value
# NOTE(review): selected and index are not referenced by the visible
# literals below; presumably they belonged to the lost markup -- confirm.
selector = (
f"{selector}\n"
f''
f"{lang_tag_to.LABEL[lang_tag]} "
)
selector = f"{selector}\n \n
\n
"
# Update the language selector block to the content
# NOTE(review): start and end are empty f-strings, so label is unused; the
# marker text appears to be missing from this copy -- confirm.
label = COMMENTS["language_selector_start"]["label"]
start = f""
label = COMMENTS["language_selector_end"]["label"]
end = f""
# re.search() returns None when nothing matches, in which case .group()
# raises AttributeError
target = re.search(f"{start}.*{end}", content, re.DOTALL).group()
replacement = f"{start}\n{selector}\n{end}"
if target == replacement:
print(
f"{filename}: Skipping unneeded update of language"
" selector options"
)
else:
print(f"{filename}: Updating language selector options")
new_content = content.replace(target, replacement, 1)
if args.debug:
diff_changes(filename, content, new_content)
# str.replace() always returns a str, so this guard cannot trigger
if new_content is None:
sys.exit(1)
return new_content
def update_lang_footer(args, filename, content, lang_tags):
"""Replace the contents of the language footer (everything between
language_footer_start and language_footer_end HTML comments) with a
list of links based on the legalcode files currently being processed.

args -- parsed options; only args.debug is read (it triggers a diff)
filename -- file being processed; its own language is left out of the list
content -- text to update
lang_tags -- language tags to link to

Returns new_content.
"""
current_language = lang_tags_from_filenames(filename)[0]
# Text before the first "_" in the file name
license_type = filename.split("_")[0]
footer = ""
for lang_tag in lang_tags:
# Do not link a translation to itself
if lang_tag == current_language:
continue
# Determine the option value for the language. English breaks the
# pattern so handle it differently.
if lang_tag == "en":
index = "legalcode"
else:
index = f"legalcode.{lang_tag}"
# NOTE(review): the first f-string below is empty and neither license_type
# nor index is referenced; the link markup appears to have been lost from
# this copy -- confirm.
link = (
f''
f"{lang_tag_to.LABEL[lang_tag]} ,\n"
)
footer = f"{footer}{link}"
# rstrip() takes a set of characters: every trailing "," and newline goes
footer = footer.rstrip(",\n")
# Update the language footer block to the content
# NOTE(review): start and end are empty f-strings, so label is unused; the
# marker text appears to be missing from this copy -- confirm.
label = COMMENTS["language_footer_start"]["label"]
start = f""
label = COMMENTS["language_footer_end"]["label"]
end = f""
# re.search() returns None when nothing matches, in which case .group()
# raises AttributeError
target = re.search(f"{start}.*{end}", content, re.DOTALL).group()
if current_language in ["ja", "zh-Hans", "zh-Hant"]:
# Use ideographic full stop ("。")
period = "\u3002"
else:
# Use ASCII period
period = "."
replacement = f"{start}\n{footer}{period}\n{end}"
if target == replacement:
print(
f"{filename}: Skipping unneeded update of language footer"
" links"
)
else:
print(f"{filename}: Updating language footer links")
new_content = content.replace(target, replacement, 1)
if args.debug:
diff_changes(filename, content, new_content)
# str.replace() always returns a str, so this guard cannot trigger
if new_content is None:
sys.exit(1)
return new_content
def insert_missing_comment(args, filename, content, comment_dict):
"""Insert the comment in the appropriate locations, if it is not already
present.

args -- parsed options; only args.debug is read (it triggers a diff)
filename -- name used in the log messages and the diff header
content -- text to update
comment_dict -- one COMMENTS entry; its "label" and "regex" values are read

Returns content unchanged when the comment is already present, otherwise
the text with the comment inserted. Exits with status 1 when the regex
does not match.
"""
label = comment_dict["label"]
# NOTE(review): comment is an empty f-string here, so label never reaches
# the inserted text and the find() below always succeeds at offset 0; the
# marker text appears to be missing from this copy -- confirm.
comment = f""
regex = comment_dict["regex"]
if not content.find(comment) == -1:
print(
f"{filename}: Skipping unneeded {label} HTML comment insertion"
)
return content
print(f"{filename}: inserting {label } HTML comment")
matches = regex.search(content)
if matches is None:
print(
f"{filename}: ERROR: {label} insertion point not matched. Aborting"
" processing"
)
sys.exit(1)
target = matches.group("target")
# The label text decides the side: labels containing " start" put the
# comment after the matched text, all others put it before
if " start" in label.lower():
# Start comments are inserted after target regex
target_new = target.rstrip()
replacement = f"{target_new}\n{comment}\n"
else:
# End comments are inserted before target regex
target_new = target.lstrip("\n")
replacement = f"\n{comment}\n{target_new}"
# Replaces the first occurrence of the matched text in content
new_content = content.replace(target, replacement, 1)
if args.debug:
diff_changes(filename, content, new_content)
# str.replace() always returns a str, so this guard cannot trigger
if new_content is None:
sys.exit(1)
return new_content
def has_correct_faq_officialtranslations(content):
    """Report whether the translation FAQ link is already correct.

    Returns True when content contains FAQ_TRANSLATION_LINK wrapped in double
    quotes, False otherwise.
    """
    quoted_link = f'"{FAQ_TRANSLATION_LINK}"'
    return quoted_link in content
def normalize_faq_translation_link(args, filename, content):
    """Replace various incorrect translation FAQ links with the correct link
    (FAQ_TRANSLATION_LINK).

    args -- parsed options; only args.debug is read (it triggers a diff)
    filename -- name used in the log messages and the diff header
    content -- text to update

    Returns content unchanged when the correct link is already present,
    otherwise the text with the first FAQ href rewritten. Exits with status 1
    when no FAQ href can be found.
    """
    if has_correct_faq_officialtranslations(content):
        print(
            f"{filename}: Skipping unneeded translation FAQ link"
            " normalization"
        )
        return content
    print(f"{filename}: normalizing translation FAQ link")
    # Only the "target" group (the URL between the quotes) is read below; the
    # surrounding groups anchor the match to an href attribute.
    re_pattern = re.compile(
        r"""
        (?P<prefix>
            href=['"]
        )
        (?P<target>
            # Matches various translation FAQ URLs
            [^'"]*/[Ff][Aa][Qq]/?[#][^'"]*
        )
        (?P<suffix>
            ['"]
        )
        """,
        re.DOTALL | re.MULTILINE | re.VERBOSE,
    )
    matches = re_pattern.search(content)
    if matches is None:
        print(
            f"{filename}: ERROR: translation link not matched. Aborting"
            " processing"
        )
        sys.exit(1)
    # Splice at the matched position: a plain str.replace() of the URL text
    # could instead hit an identical substring that occurs earlier in the
    # document outside of the href.
    new_content = (
        content[: matches.start("target")]
        + FAQ_TRANSLATION_LINK
        + content[matches.end("target"):]
    )
    if args.debug:
        diff_changes(filename, content, new_content)
    return new_content
def has_correct_languages_anchor(content):
    """Report whether the languages anchor is declared with ``id``.

    Returns True when content contains the exact text id="languages",
    False otherwise.
    """
    return 'id="languages"' in content
def normalize_languages_anchor(args, filename, content):
    """Replace name with id in languages anchor (HTML5 compatibility)

    args -- parsed options; only args.debug is read (it triggers a diff)
    filename -- name used in the log messages and the diff header
    content -- text to update

    Returns content unchanged when id="languages" is already present,
    otherwise the text with the first name='languages' / name="languages"
    attribute (matched case-insensitively) rewritten. Exits with status 1
    when no such attribute is found.
    """
    if has_correct_languages_anchor(content):
        print(
            f"{filename}: Skipping unneeded language anchor normalization"
        )
        return content
    print(f"{filename}: normalizing language anchor id")
    # subn() reports how many substitutions were made, so locating the
    # attribute and rewriting it is a single step
    new_content, replaced = re.subn(
        "name=['\"]languages['\"]",
        'id="languages"',
        content,
        count=1,
        flags=re.IGNORECASE,
    )
    if not replaced:
        print(
            f"{filename}: ERROR: languages anchor not matched. Aborting"
            " processing"
        )
        sys.exit(1)
    if args.debug:
        diff_changes(filename, content, new_content)
    return new_content
def normalize_line_endings(args, filename, content):
    """Normalize line endings to unix LF (\\n)

    args -- unused; accepted so all normalizers share one signature
    filename -- name used in the log messages
    content -- text to normalize

    Returns content with every CRLF pair and every bare CR replaced by LF
    (or content itself when it contains neither).
    """
    crlf_count = content.count("\r\n")
    # Every CR that does not start a CRLF pair is a bare (mac) newline
    cr_count = content.count("\r") - crlf_count
    if not (cr_count or crlf_count):
        print(f"{filename}: Skipping unneeded newline conversion")
        return content
    found = []
    if cr_count:
        found.append(f"{cr_count} mac newlines (CR)")
    if crlf_count:
        found.append(f"{crlf_count} windows newlines (CRLF)")
    summary = " and ".join(found)
    print(f"{filename}: Converting {summary} to unix newlines (LF)")
    # Convert CRLF first so that its CR is not turned into a second LF
    return content.replace("\r\n", "\n").replace("\r", "\n")
def process_file_contents(args, file_list, lang_tags):
    """Process each of the CC4 legalcode files and update them, as necessary.

    args -- parsed options; args.debug suppresses writing changes to disk
    file_list -- names of the files to process
    lang_tags -- language tags handed to the selector and footer updates

    Each file is read, run through the normalizers, the missing-comment
    insertion for every COMMENTS entry, the language selector and footer
    updates and the include updates, and is written back only when the
    result differs from what was read and debug mode is off.
    """
    for filename in file_list:
        # newline="" disables newline translation so the file's original
        # line endings reach normalize_line_endings() untouched
        with open(filename, "r", encoding="utf-8", newline="") as file_in:
            content = file_in.read()
        new_content = normalize_line_endings(args, filename, content)
        new_content = normalize_languages_anchor(args, filename, new_content)
        new_content = normalize_faq_translation_link(
            args, filename, new_content
        )
        # Markers must exist before the update_* steps search for them
        for comment_dict in COMMENTS.values():
            new_content = insert_missing_comment(
                args, filename, new_content, comment_dict
            )
        new_content = update_lang_selector(
            args, filename, new_content, lang_tags
        )
        new_content = update_lang_footer(
            args, filename, new_content, lang_tags
        )
        for section in ("head", "site_header", "site_footer"):
            new_content = update_include(args, filename, new_content, section)
        if content == new_content:
            print(
                f"{filename}: Skipping writing back to file (no changes)"
            )
        elif args.debug:
            print(f"{filename}: DEBUG: Skipping writing changes to file")
        else:
            print(f"{filename}: Writing changes to file")
            # newline="" writes the LF endings as-is; the default would turn
            # them back into the platform separator (CRLF on Windows)
            with open(
                filename, "w", encoding="utf-8", newline=""
            ) as file_out:
                file_out.write(new_content)
        # Blank lines separate the log output of consecutive files
        print()
        print()
def lang_tags_from_filenames(file_list):
    """Extract RFC 5646 language tags from filename(s)

    file_list -- a single file name (str) or an iterable of file names

    The tag is read from the second dot-separated field of the base name with
    its first two characters dropped ("by_4.0_es.html" -> "0_es" -> "es"); an
    empty tag ("by_4.0.html") means English and is reported as "en".

    Returns a sorted list of unique tags.
    """
    filenames = [file_list] if isinstance(file_list, str) else file_list
    tags = set()
    for filename in filenames:
        # Work on the base name only so that dots in a directory part
        # ("./by_4.0_es.html") do not shift the field that holds the tag
        tag = os.path.basename(filename).split(".")[1][2:]
        tags.add(tag or "en")
    return sorted(tags)
def setup():
"""Instantiate and configure argparse and logging.
Return argsparse namespace.
"""
default_glob = ["by*4.0*.html"]
ap = argparse.ArgumentParser(description=__doc__)
ap.add_argument(
"-d",
"--debug",
action="store_true",
help="Debug mode: list changes without modification",
)
ap.add_argument(
"globs",
nargs="*",
default=default_glob,
help=(
"Filename or shell glob of the file(s) that will be updated"
f' (default: "{default_glob[0]}")'
),
metavar="FILENAME",
)
args = ap.parse_args()
return args
def main():
    """Expand the requested globs and process the matching files.

    Only regular files that are not symbolic links are kept; duplicates
    produced by overlapping globs are dropped and the result is sorted.
    """
    args = setup()
    matched = set()
    for fileglob in args.globs:
        for filename in glob.glob(fileglob):
            if os.path.isfile(filename) and not os.path.islink(filename):
                matched.add(filename)
    file_list = sorted(matched)
    lang_tags = lang_tags_from_filenames(file_list)
    process_file_contents(args, file_list, lang_tags)
# Script entry point: run main() and map each failure mode to an exit status
if __name__ == "__main__":
    try:
        main()
    except SystemExit as e:
        # Pass explicit exits (including argparse's) through unchanged
        sys.exit(e.code)
    except KeyboardInterrupt:
        print("INFO (130) Halted via KeyboardInterrupt.", file=sys.stderr)
        sys.exit(130)
    except ToolError as error:
        # The ToolError text already carries its "(code) " prefix
        print(f"CRITICAL {error}", file=sys.stderr)
        sys.exit(error.code)
    except Exception:
        print("ERROR (1) Unhandled exception:", file=sys.stderr)
        # print_exc() writes the traceback to stderr itself and returns None;
        # wrapping it in print() emitted a stray "None" line
        traceback.print_exc()
        sys.exit(1)