#!/usr/bin/awk
#
# Assumes the following variables are set by the caller:
# url = URL of generated Atom feed
# link = URL of corresponding HTML page
#
# Looks for entries of the form
# <li><span class=updated>2009-02-06</span>...
#
# Author: Bert Bos <bert@w3.org>
# Created: 6 Feb 2009

# Input is read one line at a time.  The first character of a line gives
# its type: "A" = attribute (listed before the tag it belongs to),
# "(" = start tag, ")" = end tag, "|" = empty element, "-" = text.
# This is the same format the END block prints.
#
# The ORDER of the rules matters: end-tag rules run first, then the text
# and content collectors, then the start-tag rules.  That way the lines of
# the outermost <li> itself are not copied into `content', only what is
# inside it.

# Remember attributes until the tag they belong to is seen.
# For a line such as "Aclass CDATA updated", substr($0, 12) yields
# "A updated"; the extra prefix is harmless because `class' is only ever
# tested for the whole word "updated" below.
/^Aclass / {class = substr($0, 12)}
/^Aid / {id = $3}

# End tags.  When the outermost <li> of an item closes (depth is 1), store
# the item as a tab-separated record: date, raw text, content, id.
/^\)span$/ && in_date {in_date--}
/^\)li$/ && in_item == 1 {items[n++] = date "\t" raw "\t" content "\t" itemid}
/^\)li$/ && in_item {in_item--}
/^\)title$/ {in_title--}

# Text lines: substr($0, 2) drops the leading "-".  Text inside the date
# span goes to `date', other text inside an item goes to `raw'.
/^-/ && in_title {title = title substr($0, 2)}
/^-/ && in_item && !in_date {raw = raw substr($0, 2)}
/^-/ && in_date {date = date substr($0, 2)}

# While inside an item, keep every input line verbatim for the entry body.
in_item {content = content $0 "\n"}

# Start tags.  An <li> with an id that is not inside another item starts a
# new item; an <li> nested inside an item only increases the depth.
# NOTE: \< and \> are GNU awk word-boundary operators.
/^\(title$/ {in_title++}
/^\(li$/ && id && !in_item {itemid = id; content = ""; date = ""; raw = ""}
/^\(li$/ && (id || in_item) {in_item++}
/^\(span$/ && in_item && class ~ /\<updated\>/ {in_date++}

# The pending attributes have now been consumed by their element.  Clear
# them after empty elements ("|") as well as after start tags ("("), so
# that an id or class on an empty element cannot leak to the next tag.
/^[(|]/ {id = ""; class = ""}


# Print the Atom feed: a feed header followed by at most 15 entries,
# newest first.  Items whose date is not of the form YYYY-MM-DD are
# ignored, both for the feed's own <updated> value and for the entry count.
END {
  # sort() is not an awk built-in and is not defined in this file; it has
  # to be provided by another program file.  The code below takes
  # items[n-1] as the newest item, so it presumably sorts items[0..n-1]
  # in ascending order -- TODO confirm against the definition of sort().
  sort(items, 0, n - 1);

  max_entries = 15;
  date_re = "^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]$";

  # The feed's <updated> is the date of the newest item that has a valid
  # date.  An item with an empty or malformed date must not end up here,
  # or the feed-level timestamp would be invalid.
  newest_date = "1970-01-01";
  for (i = n - 1; i >= 0; i--) {
    d = substr(items[i], 1, index(items[i], "\t") - 1);
    if (d ~ date_re) {
      newest_date = d;
      break;
    }
  }

  printf "?xml-stylesheet href=\"/StyleSheets/atom.css\"?\n-\\n\n";
  printf "Axmlns CDATA http://www.w3.org/2005/Atom\nAxml:lang CDATA en\n(feed\n-\\n\n";
  printf "(id\n-%s\n)id\n-\\n\n", url;
  printf "(title\n-%s\n)title\n-\\n\n", title;
  printf "(updated\n-%sT00:00:00Z\n)updated\n-\\n\n", newest_date;
  printf "Ahref CDATA %s\n|link\n-\\n\n", link;
  printf "Arel CDATA self\nAhref CDATA %s\n|link\n-\\n\n", url;
  printf "(author\n(name\n-W3C\n)name\n)author\n-\\n\\n\n";

  # Walk from newest to oldest.  Only entries that are actually printed
  # count towards the limit, so items without a valid date do not use up
  # slots.
  emitted = 0;
  for (i = n - 1; i >= 0 && emitted < max_entries; i--) {

    # Each record is: date TAB raw text TAB content TAB id
    split(items[i], x, "\t");
    date = x[1];
    raw = x[2];
    content = x[3];
    id = x[4];

    # Check if we have a date
    if (date !~ date_re) continue;
    emitted++;

    # Create a (truncated) title from the raw contents: turn the escape
    # sequences \n, \t and \r into spaces, collapse runs of spaces, drop
    # leading spaces and unescape doubled backslashes before cutting the
    # text to 60 characters.
    gsub(/\\n/, " ", raw);
    gsub(/\\t/, " ", raw);
    gsub(/\\r/, " ", raw);
    gsub(/  */, " ", raw);
    sub(/^ */, "", raw);
    gsub(/\\\\/, "\\", raw);
    t = substr(raw, 1, 60);
    if (t != raw) t = t "…";
    # Intended to escape backslashes again for the output.
    # NOTE(review): how "\\\\" is treated in a gsub() replacement differs
    # between awk implementations/modes -- verify with the awk in use.
    gsub(/\\/, "\\\\", t);

    # Create a (hopefully unique) ID from the title
    if (!id) {
      id = t;
      gsub(/\\[#x0-9a-fA-F]*;/, "_", id);
      gsub(/[^a-zA-Z0-9 _-]/, "", id);
      gsub(/ /, "_", id);
    }

    printf "(entry\n-\\n\n";
    printf "(title\n-%s\n)title\n-\\n\n", t;
    printf "Ahref CDATA %s#%s\n|link\n-\\n\n", link, id;
    printf "(id\n-%s#%s\n)id\n-\\n\n", link, id;
    printf "(updated\n-%sT00:00:00Z\n)updated\n-\\n\n", date;
    printf "Atype CDATA xhtml\n(content\n-\\n\n";
    # `content' holds complete input lines, each already ending in a newline.
    printf "(div\n%s)div\n-\\n\n", content;
    printf ")content\n-\\n\n";
    printf ")entry\n-\\n\\n\n";
  }
  printf ")feed\n-\\n\n";
}
