#!/usr/bin/awk -f
#
# Assumes the following variables
# link = URL of corresponding HTML page
#
# Looks for entries of the form
# <li><span class=updated title="2009-02-06">...</span>... text
#
# Author: Bert Bos <bert@w3.org>
# Created: 6 Feb 2009

# Remember the class and title attributes announced ahead of a tag.
# The value is taken from a fixed offset, i.e. the line is assumed to
# read "Aname CDATA value" (the same shape this script prints in END).
index($0, "Aclass ") == 1 {class = substr($0, length("Aclass CDATA ") + 1)}
index($0, "Atitle ") == 1 {titleattr = substr($0, length("Atitle CDATA ") + 1)}

# End tags: unwind the nesting counters.
$0 == ")span" && in_date {in_date--}
$0 == ")li" && in_item {
  # Leaving the outermost <li>: record the entry, but only if a date was
  # found for it.  Entries are stored as "date<TAB>text".
  if (in_item == 1 && date) items[n++] = date "\t" raw
  in_item--
}
$0 == ")title" {in_title--}

# Text lines ("-" followed by the text): route the text to whatever is
# currently being collected.
/^-/ {
  # Document title.
  if (in_title) title = title substr($0, 2)
  # Body of a list item, excluding whatever is inside the date span.
  if (in_item && !in_date) raw = raw substr($0, 2)
  # The date span's own text is the date when no title attribute gave one.
  if (in_date && !date) date = substr($0, 2)
}

# Start tags: update the nesting counters.  These rules must stay after
# the end-tag and text rules and in this order; the attribute reset has
# to come last because the span rule still needs class and titleattr.
/^\(title$/ {in_title++}

/^\(li$/ {
  # A new outermost <li> starts a fresh entry; nested <li>s belong to it.
  if (!in_item) {date = ""; raw = ""}
  in_item++
}

/^\(span$/ && in_item {
  if (class ~ /\<updated\>/) {
    # <span class="... updated ..."> (\< and \> are gawk word boundaries):
    # its title attribute, if any, is the date of the entry.
    date = titleattr
    in_date++
  } else if (in_date) {
    # A span nested inside the date span: keep the depth balanced so the
    # matching ")span" does not end the date span early.
    in_date++
  }
}

# Attributes belong to the tag that follows them.  Forget them after a
# start tag "(" and also after an empty element "|"; otherwise the class
# and title of an empty element would be attributed to the next start tag.
/^[(|]/ {class = ""; titleattr = ""}


# Emit the RSS 1.0 feed, in the same line-oriented format as the input:
# a channel that lists the newest entries, followed by one <item> per
# listed entry.
#
# Reads:  items[0..n-1] ("date<TAB>text" strings), title, link.
# Relies on sort(), which is not defined in this file, to order
# items[0..n-1] so that the newest entry ends up last.
END {
  sort(items, 0, n - 1);

  # Select the entries to publish: at most 15, newest first, and only
  # those with a well-formed YYYY-MM-DD date.  The same selection drives
  # the channel date, the rdf:Seq and the <item>s, so that the rdf:Seq
  # never refers to an item that is not emitted.
  count = 0;
  for (i = n - 1; i >= 0 && count < 15; i--)
    if (items[i] ~ /^[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]\t/)
      picked[count++] = i;

  # The channel date is that of the newest published entry.
  if (count == 0) newest_date = "1970-01-01";
  else newest_date = substr(items[picked[0]], 1, index(items[picked[0]], "\t") - 1);

  printf "?xml-stylesheet href=\"/2000/08/w3c-synd/style.css\"?\n-\\n\n";
  printf "Axmlns:rdf CDATA http://www.w3.org/1999/02/22-rdf-syntax-ns#\n";
  printf "Axmlns:dc CDATA http://purl.org/dc/elements/1.1/\n";
  printf "Axmlns CDATA http://purl.org/rss/1.0/\n";
  printf "(rdf:RDF\n-\\n\n";
  printf "Ardf:about CDATA \n(channel\n-\\n\n";
  printf "(title\n-%s\n)title\n-\\n\n", title;
  printf "(link\n-%s\n)link\n-\\n\n", link;
  printf "(dc:date\n-%s\n)dc:date\n-\\n\n", newest_date;
  printf "(items\n-\\n\n(rdf:Seq\n-\\n\n";
  for (k = 0; k < count; k++)
    printf "Ardf:resource CDATA #x%d\n|rdf:li\n-\\n\n", picked[k];
  printf ")rdf:Seq\n-\\n\n)items\n-\\n\n)channel\n-\\n\\n\n";

  for (k = 0; k < count; k++) {

    # The index in the sorted array doubles as the item's ID.
    i = picked[k];
    split(items[i], x, "\t");
    date = x[1];
    desc = x[2];

    # Clean up the description: the escaped newlines and tabs in the text
    # become spaces, runs of spaces collapse, leading spaces go.
    gsub(/\\n/, " ", desc);
    gsub(/\\t/, " ", desc);
    gsub(/  */, " ", desc);
    sub(/^ */, "", desc);

    # The title is the first 60 characters of the description, with an
    # ellipsis (U+2026, written as an escape) if anything was cut off.
    t = substr(desc, 1, 60);
    if (t != desc) t = t "\\#x2026;";

    printf "Ardf:ID CDATA x%d\n(item\n-\\n\n", i;
    printf "(title\n-%s\n)title\n-\\n\n", t;
    printf "(description\n-%s\n)description\n-\\n\n", desc;
    printf "(link\n-%s\n)link\n-\\n\n", link;
    printf "(dc:date\n-%s\n)dc:date\n-\\n\n", date;
    printf ")item\n-\\n\\n\n";
  }
  printf ")rdf:RDF\n-\\n\n";
}
