-
Notifications
You must be signed in to change notification settings - Fork 791
Expand file tree
/
Copy pathextractmeta
More file actions
131 lines (117 loc) · 3.55 KB
/
extractmeta
File metadata and controls
131 lines (117 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/local/bin/perl -w
#
# Extract info from LINK and META and put them in an RDF file
#
# Reads from 'file' (default stdin) and writes to 'output'
# (default stdout). The URL of the input is assumed to be URL
# (default 'file').
#
# Bert Bos <bert@w3.org>
# $Id: extractmeta,v 1.3 2006-10-09 18:55:52 ihickson Exp $
use Getopt::Std;
#use lib 'bin';
#use utils;
$PROG = substr($0, rindex($0, "/") + 1);
$USAGE = "Usage: $PROG [-r realname] [file [output]]\n";
# $attvalre matches attribute values; the value is $1.$2.$3
$attvalre = '\s*=\s*(?:(\w+)|\"([^\"]+)\"|\'([^\']+)\')';
# clean replaces non-alphanum characters with '_', and collapses whitespace
sub clean {
my $r = $_[0];
$r =~ s/^\s+//o;
$r =~ s/\s+/ /go;
$r =~ s/ $//o;
$r =~ tr/a-z/A-Z/;
$r =~ s/[^A-z0-9_.-]/_/go;
return lc($r);
}
# urlexpand expands a relative URL to an absolute one
sub urlexpand {
my ($url, $base) = @_;
my $result;
if ($url =~ /^\w+:/) { # Already absolute
$result = $url; # Keep as is
} elsif ($url =~ /^\//o) { # Starts with '/'
$base =~ /^\w+:(\/\/[^\/]*\/?)?/o;
$result = $&.$url; # Prefix protocol and possibly machine
} else { # Starts with path segment
$base =~ /[^\/:]*$/;
$result = $`.$url; # Prefix everything except last segment
$result =~ s/[^\/]*\/\.\.//go;
}
return $result;
}
getopts('r:') || die $USAGE;
if ($#ARGV >= 0) {$file = $ARGV[0]; shift;} else {$file = '-';}
if ($#ARGV >= 0) {$output = $ARGV[0]; shift;} else {$output = '-';}
if ($#ARGV >= 0) {die $USAGE;}
my $url = defined $opt_r ? $opt_r : $file;
open(IN, $file) or die "$PROG: cannot read file $file\n";
open(OUT, ">$output") or die "$PROG: cannot write to file $output\n";
print OUT '<!--
The database schema is defined as follows:
- there is one table, called "meta"
- it has three columns called "href", "property", and "value"
- "href" is a URL
- "property" is a keyword
- "value" is a string
The meaning of a record, paraphrased in English, is:
"the _property_ property of the resource at _href_ has value _value_"
There are further constraints on the value that depend on the property,
but which are not spelled out here.
-->
';
print OUT "<RDF schema=\"http://www.w3.org/TR/REC-ACL\">\n";
$/ = '<';
while (<IN>) {
if (/^link\b/sio) {
if (/\brel$attvalre/sio) {
my $rel = clean(defined($1) ? $1 : defined($2) ? $2 : $3);
if (/\bhref$attvalre/sio) {
my $href = urlexpand(defined($1)?$1:defined($2)?$2:$3, $url);
print OUT " <meta
href=\"$url\"
property=\"$rel\"
value=\"$href\"/>\n";
if (/\btype$attvalre/sio) {
my $type = lc(defined($1) ? $1 : defined($2) ? $2 : $3);
print OUT " <meta
href=\"$href\"
property=\"content-type\"
value=\"$type\"/>\n";
}
if (/\btitle$attvalre/sio) {
my $title = defined($1) ? $1 : defined($2) ? $2 : $3;
print OUT " <meta
href=\"$href\"
property=\"title\"
value=\"$title\"/>\n";
}
if (/\bmedia$attvalre/sio) {
my $h = lc(defined($1) ? $1 : defined($2) ? $2 : $3);
my @media = split("\w+", $h);
foreach my $h (@media) {
print OUT " <meta
href=\"$href\"
property=\"medium\"
value=\"$h\"/>\n";
}
}
}
}
} elsif (/^meta\b/sio) {
if (/\bcontent$attvalre/sio) {
$value = defined($1) ? $1 : defined($2) ? $2 : $3;
if (/\b(?:name|http-equiv)$attvalre/sio) {
$property = clean(defined($1) ? $1 : defined($2) ? $2 : $3);
print OUT " <meta
href=\"$url\"
property=\"$property\"
value=\"$value\"/>\n";
}
}
}
}
print OUT "</RDF>\n";
close(IN);
close(OUT);