#!/usr/local/bin/perl -w # # Extract info from LINK and META and put them in an RDF file # # Reads from 'file' (default stdin) and writes to 'output' # (default stdout). The URL of the input is assumed to be URL # (default 'file'). # # Bert Bos # $Id: extractmeta,v 1.3 2006-10-09 18:55:52 ihickson Exp $ use Getopt::Std; #use lib 'bin'; #use utils; $PROG = substr($0, rindex($0, "/") + 1); $USAGE = "Usage: $PROG [-r realname] [file [output]]\n"; # $attvalre matches attribute values; the value is $1.$2.$3 $attvalre = '\s*=\s*(?:(\w+)|\"([^\"]+)\"|\'([^\']+)\')'; # clean replaces non-alphanum characters with '_', and collapses whitespace sub clean { my $r = $_[0]; $r =~ s/^\s+//o; $r =~ s/\s+/ /go; $r =~ s/ $//o; $r =~ tr/a-z/A-Z/; $r =~ s/[^A-z0-9_.-]/_/go; return lc($r); } # urlexpand expands a relative URL to an absolute one sub urlexpand { my ($url, $base) = @_; my $result; if ($url =~ /^\w+:/) { # Already absolute $result = $url; # Keep as is } elsif ($url =~ /^\//o) { # Starts with '/' $base =~ /^\w+:(\/\/[^\/]*\/?)?/o; $result = $&.$url; # Prefix protocol and possibly machine } else { # Starts with path segment $base =~ /[^\/:]*$/; $result = $`.$url; # Prefix everything except last segment $result =~ s/[^\/]*\/\.\.//go; } return $result; } getopts('r:') || die $USAGE; if ($#ARGV >= 0) {$file = $ARGV[0]; shift;} else {$file = '-';} if ($#ARGV >= 0) {$output = $ARGV[0]; shift;} else {$output = '-';} if ($#ARGV >= 0) {die $USAGE;} my $url = defined $opt_r ? $opt_r : $file; open(IN, $file) or die "$PROG: cannot read file $file\n"; open(OUT, ">$output") or die "$PROG: cannot write to file $output\n"; print OUT ' '; print OUT "\n"; $/ = '<'; while () { if (/^link\b/sio) { if (/\brel$attvalre/sio) { my $rel = clean(defined($1) ? $1 : defined($2) ? $2 : $3); if (/\bhref$attvalre/sio) { my $href = urlexpand(defined($1)?$1:defined($2)?$2:$3, $url); print OUT " \n"; if (/\btype$attvalre/sio) { my $type = lc(defined($1) ? $1 : defined($2) ? $2 : $3); print OUT " \n"; } if (/\btitle$attvalre/sio) { my $title = defined($1) ? $1 : defined($2) ? $2 : $3; print OUT " \n"; } if (/\bmedia$attvalre/sio) { my $h = lc(defined($1) ? $1 : defined($2) ? $2 : $3); my @media = split("\w+", $h); foreach my $h (@media) { print OUT " \n"; } } } } } elsif (/^meta\b/sio) { if (/\bcontent$attvalre/sio) { $value = defined($1) ? $1 : defined($2) ? $2 : $3; if (/\b(?:name|http-equiv)$attvalre/sio) { $property = clean(defined($1) ? $1 : defined($2) ? $2 : $3); print OUT " \n"; } } } } print OUT "\n"; close(IN); close(OUT);