|
| 1 | +#!/usr/local/bin/perl -w |
| 2 | +# Extract examples from an HTML document |
| 3 | +# |
| 4 | +# extract the content of the blocks: |
| 5 | +# <div class="example"> ... </div> |
| 6 | +# and put their contents into separate files |
| 7 | +# |
| 8 | +# Arnaud Le Hors - lehors@w3.org |
| 9 | +# $Id: xextr,v 1.1 1997-12-29 21:10:46 ijacobs Exp $ |
| 10 | + |
| 11 | +$PROGNAME = substr($0, rindex($0, "/") + 1); |
| 12 | + |
| 13 | +if (!$ARGV[0]) { |
| 14 | + print "Usage: $PROGNAME file [tgt_dir]\n"; |
| 15 | + exit 1; |
| 16 | +} |
| 17 | + |
| 18 | +# copy file in memory |
| 19 | +if (!open(INPUT, $ARGV[0])) { |
| 20 | + print "$PROGNAME Error: Cannot open file: $ARGV[0]\n"; |
| 21 | + exit 1; |
| 22 | +} |
| 23 | +$buf = ""; |
| 24 | +while (<INPUT>) { |
| 25 | + $buf .= $_; |
| 26 | +} |
| 27 | +close(INPUT); |
| 28 | + |
| 29 | +$path = ""; |
| 30 | +if ($ARGV[1]) { |
| 31 | + $path = "$ARGV[1]"; |
| 32 | +} |
| 33 | + |
| 34 | +@paths = split(/\//, $ARGV[0]); |
| 35 | +$path .= "/" . $paths[$#paths] . "_xampl"; |
| 36 | + |
| 37 | +sub process_html_block { |
| 38 | + my ($file, $block, $deprecated) = @_; |
| 39 | + |
| 40 | + # comment out undesired block |
| 41 | + $block =~ s/<em>(.*?)<\/em>/<!-- $1 -->/sigo; |
| 42 | + |
| 43 | + # map characters |
| 44 | + $block =~ s/</</g; |
| 45 | + $block =~ s/>/>/g; |
| 46 | + $block =~ s/"/\"/g; |
| 47 | + $block =~ s/©/(C)/g; |
| 48 | + $block =~ s/&/\&/g; |
| 49 | + |
| 50 | + # make sure it does form a complete document |
| 51 | + if (!($_ = $block, /<!doctype/i)) { |
| 52 | + if (($_ = $block, /frameset/i)) { |
| 53 | + $doctype = |
| 54 | + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Frameset//EN\">"; |
| 55 | + } elsif ($deprecated || ($_ = $block, /noframe|iframe/i)) { |
| 56 | + $doctype = |
| 57 | + "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">"; |
| 58 | + } else { |
| 59 | + $doctype = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0//EN\">"; |
| 60 | + if (!($_ = $block, /<head>/i) && |
| 61 | + (substr($block, 0, 1) ne "<" || |
| 62 | + substr($block, 0, 2) eq "<!" || |
| 63 | + substr($block, 0, 2) eq "<?")) { |
| 64 | + $block = "<p>\n$block"; |
| 65 | + } |
| 66 | + } |
| 67 | + $block = "$doctype\n$block"; |
| 68 | + } |
| 69 | + if (!($_ = $block, /<title>/i)) { |
| 70 | + if (!($_ = $block, /<head>/i)) { |
| 71 | + $block =~ s/<!doctype.*?>/$&\n<title>example<\/title>/i; |
| 72 | + } else { |
| 73 | + $block =~ s/<head>/<head>\n<title>example<\/title>/i; |
| 74 | + } |
| 75 | + } |
| 76 | + |
| 77 | + # print file out |
| 78 | + print "\tHTML extracting$file\n"; |
| 79 | + open(OUTPUT, "> $file") || die "failed to create $file\n"; |
| 80 | + print OUTPUT $block; |
| 81 | + close(OUTPUT); |
| 82 | +} |
| 83 | + |
| 84 | +sub process_xml_block { |
| 85 | + warn "XML extraction not implemented yet...\n"; |
| 86 | +} |
| 87 | + |
| 88 | +sub process_css_block { |
| 89 | + warn "CSS extraction not implemented yet...\n"; |
| 90 | +} |
| 91 | + |
| 92 | + |
| 93 | +# Extract different types of blocks: css, html, xml. |
| 94 | +# If the type is "html" or "xml" at the DIV level, |
| 95 | +# assume that *all* pre elements within are "html" or |
| 96 | +# "xml" respectively. If the type is unknown, |
| 97 | +# assume css, but allow internal pre elements to |
| 98 | +# override this. |
| 99 | + |
| 100 | +sub get_lang { |
| 101 | + $pattern =$_[0]; |
| 102 | + if ( $pattern =~ /html/io ){ |
| 103 | + return "html"; |
| 104 | + } elsif ( $pattern =~ /xml/io ){ |
| 105 | + return "xml"; |
| 106 | + } else { |
| 107 | + return "css"; |
| 108 | + } |
| 109 | +} |
| 110 | + |
| 111 | +sub process_pre_block { |
| 112 | + ($preclass, $preblock) = @_; |
| 113 | + $lang = get_lang($preclass); |
| 114 | + $output = "$path$num.$subnum.$lang"; |
| 115 | + $deprecated = ($preclass =~ /deprecated/io) ? 1 : 0; |
| 116 | + if ($lang eq "html") { |
| 117 | + process_html_block("$output", $preblock, $deprecated); |
| 118 | + } elsif ($lang eq "xml") { |
| 119 | + process_xml_block("$output", $preblock, $deprecated); |
| 120 | + } elsif ($lang eq "css") { |
| 121 | + process_css_block("$output", $preblock, $deprecated); |
| 122 | + } else { |
| 123 | + warn "Unknown language $lang"; |
| 124 | + } |
| 125 | +} |
| 126 | + |
| 127 | + |
| 128 | + |
| 129 | +$preblockst = "(?:<pre>|(?:<pre\\s+class\\s*=\\s*\"?(example|deprecated-example|html-example|deprecated-html-example|xml-example|deprecated-xml-example)\"?\\s*>))\n?"; |
| 130 | +$preblocket = "\n?<\/pre>"; |
| 131 | + |
| 132 | +# Arguments: |
| 133 | +# 0: DIV class value |
| 134 | +# 1: DIV content |
| 135 | +sub process_pre_blocks { |
| 136 | + ($divclass, $divcontent) = @_; |
| 137 | + while ($divcontent =~ /$preblockst(.*?)$preblocket/iso) { |
| 138 | + $subnum++; |
| 139 | + process_pre_block(($1 || $divclass), $2); |
| 140 | + $divcontent = $' |
| 141 | + } |
| 142 | +} |
| 143 | + |
| 144 | +$divblockst = "<(div|pre)\\s+class\\s*=\\s*\"?(example|deprecated-example|html-example|deprecated-html-example|xml-example|deprecated-xml-example)\"?\\s*>\n?"; |
| 145 | +$divblocket = "\n?<\/\\1>"; |
| 146 | + |
| 147 | +$_ = $buf; |
| 148 | +$num = 0; |
| 149 | +$subnum = 0; |
| 150 | + |
| 151 | +while (/$divblockst(.*?)$divblocket/iso) { |
| 152 | + $continue = $'; |
| 153 | + $element = $1; |
| 154 | + $class = $2; |
| 155 | + $block = $3; |
| 156 | + $num++; |
| 157 | + $subnum = 0; |
| 158 | + # If DIV, process internal PRE blocks, |
| 159 | + # otherwise, process PRE block directly. |
| 160 | + if ($element =~ /div/io) { |
| 161 | + process_pre_blocks($class, $block); |
| 162 | + } else { |
| 163 | + process_pre_block($class, $block); |
| 164 | + } |
| 165 | + $_ = $continue; |
| 166 | +} |
0 commit comments