#!/usr/bin/perl -s
#
# Re-indents HTML source and normalises its tags: tag and attribute names are
# upper-cased, attribute values are quoted consistently, block-level tags are
# placed on their own lines with two-space indentation, and the nesting of
# paired tags is checked.
#
# Usage: <script> [-switch ...] file ...
#
# Switches are parsed by perl's own -s option (see the #! line); each one sets
# the package variable of the same name:
#   -nodoublespace  collapse runs of spaces in the input before processing
#   -nocroak        report unmatched / unclosed tags but keep going
#   -compact        always keep TR, TD and FONT tags inline
#   -nobr           do not break the output line after BR tags
#   -xml            quote every attribute value; skip the HTML-only cleanups
#   -inplace        write the result back to the file (default: STDOUT)
#   -nobak          with -inplace, do not keep a "<file>.bak" copy
#   -template       pass template tags (foreach, include, ...) through as-is
#
# "use warnings" is deliberately not enabled: unset switches and empty stack
# slots are read as false/empty throughout.
#
# NOTE(review): the copy of this file that was reviewed had lost several
# spans of text (everything that looked like an HTML tag had been stripped,
# and runs of whitespace had been collapsed). The places that had to be
# reconstructed are marked with NOTE(review) below and should be checked
# against a pristine copy if one exists.

use strict;

our ($nodoublespace, $nocroak, $compact, $nobr, $xml, $inplace, $nobak,
     $template);

# Tags that must be closed; their nesting is tracked on @stack.
my @multitags = qw(
    A PRE B BLINK I U TT ADDRESS BLOCKQUOTE CITE CODE DFN EM KBD SAMP STRONG
    VAR FONT HEAD HTML TITLE BODY TABLE TR TD FORM FRAMESET MAP CENTER SCRIPT
    NOSCRIPT OL UL DIV SPAN SELECT
);

# Tags that always get a line of their own and indent their content.
my @indenttags = qw(
    PRE HEAD HTML BODY TABLE FORM FRAMESET MAP CENTER SCRIPT NOSCRIPT OL UL
    DIV SELECT
);

# Tags left untouched by rewritetag() when -template is given.
my @templatetags = qw(
    foreach include include-sub print eval if ifnot ifeval else elseif
    elseifnot elseifeval
);
my $templatetagreg = join '|', @templatetags;

# One indentation level. The outdent code removes two characters, so the
# indent step has to be two characters as well.
my $INDENT_STEP = "  ";

# Per-file processing state, shared with the helper subs below.
my $textin      = "";   # input that is still to be processed
my $textout     = "";   # output produced so far
my $indent      = "";   # current indentation prefix
my @stack       = ();   # currently open @multitags, innermost last
my $onelinenobr = 0;    # number of TR/TD/FONT tags currently laid out inline
my $tilen       = 0;    # length of the whole input, for "near byte" messages

foreach my $file (@ARGV) {
    open(my $in, '<', $file)
        or die "cannot open '$file' for reading: $!\n";
    my @lines = <$in>;
    close($in);

    $textin      = unwordify(join('', @lines));
    $textout     = "";
    $indent      = "";
    @stack       = ();
    $onelinenobr = 0;

    print STDERR "Preprocess Cleanup...\n";
    # Drop the existing leading indentation of every line.
    $textin =~ s!^( |\t)*(.*)( |\t)*$!$2!igm;
    $textin =~ s| +| |gis if $nodoublespace;

    print STDERR "Processing...\n";
    my $notif = 2000000000;
    $tilen = length($textin);

    while ($textin ne "") {
        copy_newlines();
        last if $textin eq "";

        # Progress report roughly every 1000 bytes.
        my $left = length($textin);
        if ($left <= $notif) {
            print STDERR $left . " bytes left to process.\n";
            $notif = $left - 1000;
        }

        my $first = substr($textin, 0, 1);
        if ($first eq "<") {
            my ($endtag, $thistag) = read_tag();
            track_nesting($endtag, $thistag);
            layout_tag($endtag, $thistag);
        }
        elsif ($first eq "{" && in_script()) {
            open_brace();
        }
        elsif ($first eq "}" && in_script()) {
            close_brace();
        }
        else {
            pass_char();
        }

        copy_newlines();
    }

    if (@stack) {
        print STDERR "Unclosed tags remaining!\n";
        print STDERR " Stack: ", join(", ", @stack) . "\n";
        exit(1) unless $nocroak;
    }

    print STDERR "Postprocess Cleanup...\n";
    $textout =~ s|^\n+||igs;
    $textout =~ s|\n+$|\n|igs;

    print STDERR "Postpostprocess Cleanup...\n";
    $textout =~ s|\n +\n|\n|igs;
    $textout =~ s|\n+|\n|igs;
    $textout =~ s| +\n|\n|igs;

    print STDERR "Postpostpostprocess Cleanup...\n";
    if (!$xml) {
        # Unquote and upper-case "#rrggbb"-style values (plain and
        # backslash-escaped quotes).
        $textout =~ s|\"(#[0-9a-f]+)\"|uc($1)|egis;
        $textout =~ s|\\\"(#[0-9a-f]+)\\\"|uc($1)|egis;
    }

    print STDERR "Postpostpostpostprocess Cleanup...\n";
    if (!$xml) {
        $textout =~ s| align=\"left\"| ALIGN=LEFT|gis;
        $textout =~ s| align=\"right\"| ALIGN=RIGHT|gis;
        $textout =~ s| align=\"center\"| ALIGN=CENTER|gis;
        $textout =~ s| align=\"middle\"| ALIGN=CENTER|gis;
        $textout =~ s| align=middle| ALIGN=CENTER|gis;
        $textout =~ s| valign=\"top\"| VALIGN=TOP|gis;
        $textout =~ s| valign=\"bottom\"| VALIGN=BOTTOM|gis;
        $textout =~ s| valign=\"center\"| VALIGN=CENTER|gis;
        $textout =~ s| valign=\"middle\"| VALIGN=CENTER|gis;
        $textout =~ s| valign=middle| VALIGN=CENTER|gis;
    }

    if ($inplace) {
        # Refuse to overwrite the file when the backup could not be made.
        unless ($nobak) {
            rename($file, "$file.bak")
                or die "cannot rename '$file' to '$file.bak': $!\n";
        }
        open(my $out, '>', $file)
            or die "cannot open '$file' for writing: $!\n";
        print {$out} $textout;
        close($out) or die "cannot finish writing '$file': $!\n";
    }
    else {
        print $textout;
    }
}

# True while a SCRIPT tag is open somewhere on @stack.
sub in_script {
    return in("script", @stack) >= 0;
}

# Copies every newline at the front of $textin to $textout, each followed by
# the current indentation.
sub copy_newlines {
    while (substr($textin, 0, 1) eq "\n") {
        $textout .= "\n" . $indent;
        $textin = substr($textin, 1);
    }
    return;
}

# Moves exactly one character from $textin to $textout.
sub pass_char {
    $textout .= substr($textin, 0, 1);
    $textin = substr($textin, 1);
    return;
}

# Removes a single newline from the front of $textin, if there is one.
sub skip_one_newline {
    $textin = substr($textin, 1) if substr($textin, 0, 1) eq "\n";
    return;
}

# Normalises the tag at the front of $textin (which starts with "<") in place
# and returns ($endtag, $thistag): whether it is a closing tag, and its name.
# Returns (0, "") when the text does not form a tag; the caller then copies
# the "<" through as an ordinary character. The original code did not check
# its matches and reused stale $1/$2 in that situation.
sub read_tag {
    if (substr($textin, 1, 1) ne "!") {
        # Join a tag that spans several lines onto one line.
        1 while $textin =~ s|^(<[^>]*)\n([^>]*>)|$1 $2|s;

        return (0, "") unless $textin =~ m|^(<[^>]+>)|;
        my $tag  = $1;
        my $rest = substr($textin, length($tag));

        # NOTE(review): the reviewed copy showed a substitution of one space
        # by one space here (a no-op); it is presumably a collapse of runs of
        # spaces whose whitespace was lost - confirm.
        $tag =~ s| +| |g;

        $textin = rewritetag($tag) . $rest;
    }

    return (0, "") unless $textin =~ m|^<(/?)([^ >]+)|is;
    return ($1 eq "/" ? 1 : 0, $2);
}

# Maintains @stack for tags listed in @multitags and reports a closing tag
# that does not match the innermost open tag. Exits with status 1 on a
# mismatch unless -nocroak was given. Inside a SCRIPT element only the SCRIPT
# tag itself is tracked.
sub track_nesting {
    my ($endtag, $thistag) = @_;

    return unless lc($thistag) eq "script" || !in_script();
    return unless in($thistag, @multitags) >= 0;

    if (!$endtag) {
        push @stack, $thistag;
        return;
    }
    if (@stack && $thistag eq $stack[-1]) {
        pop @stack;
        return;
    }

    print STDERR "Unmatched endtag '/" . $thistag . "' near byte "
        . ($tilen - length($textin) + 1) . "!\n";
    print STDERR " Stack: ", join(", ", @stack) . "\n";

    my $contextlen = 1050;
    my $before = length($textout) > $contextlen
        ? substr($textout, -$contextlen)
        : $textout;
    my $context = $before . "==========> " . substr($textin, 0, $contextlen);

    # NOTE(review): whitespace inside the two " +" patterns below may have
    # been collapsed in the reviewed copy; they are kept as seen.
    if ($nocroak) {
        $context =~ s| +|...|gs;
        $context =~ s|\n|\\n|gs;
        print STDERR " Context: '$context'\n";
    }
    else {
        $context =~ s|\t| |gs;
        $context =~ s| +|...|gs;
        $context =~ s|\n+|\n|gs;
        print STDERR " Context:\n$context\n";
        exit(1);
    }
    return;
}

# Decides how the tag at the front of $textin is laid out in the output.
# Must run after track_nesting() so that @stack already reflects this tag.
sub layout_tag {
    my ($endtag, $thistag) = @_;
    my $in_script = in_script();

    # NOTE(review): the guard of this first branch was lost in the reviewed
    # copy (only ">=0" survived); it is reconstructed to mirror the guard
    # used for @multitags in track_nesting().
    if ((lc($thistag) eq "script" || !$in_script)
        && in($thistag, @indenttags) >= 0) {
        if ($endtag) { close_block(); }
        else         { open_block(); }
    }
    elsif (!$in_script
        && ($thistag eq "TR" || $thistag eq "TD" || $thistag eq "FONT")) {
        layout_inline_capable_tag($endtag, $thistag);
    }
    # NOTE(review): the original condition carried one more
    # "$textin !~ m|^...|" test that could not be recovered.
    elsif (!$onelinenobr && !$nobr && !$in_script && $thistag eq "BR") {
        emit_tag_line();
    }
    else {
        pass_char();
    }
    return;
}

# Lays out TR, TD and FONT, which are kept inline when -compact is set, when
# a FONT sits directly inside an A, or when tag and content share a line;
# otherwise they are treated like an indenting block tag.
#
# NOTE(review): the same-line tests and the whole closing-tag branch were
# partly lost in the reviewed copy. They are reconstructed as: "content
# follows the opening tag on the same input line" and "the opening tag is
# still on the current output line", with $onelinenobr counting the tags that
# are currently inline. Confirm against the original.
sub layout_inline_capable_tag {
    my ($endtag, $thistag) = @_;

    if (!$endtag) {
        my $inline = $compact
            || ($thistag eq "FONT" && defined $stack[-2] && $stack[-2] eq "A")
            || $textin =~ m|^<\Q$thistag\E[^\n]*>[^\n]+|i;
        if ($inline) {
            $onelinenobr++;
            pass_char();
        }
        else {
            open_block();
        }
        return;
    }

    my $inline = $compact
        || ($thistag eq "FONT" && @stack && $stack[-1] eq "A")
        || $textout =~ m|<\Q$thistag\E[^\n]*\z|i;
    if ($inline) {
        $onelinenobr-- if $onelinenobr > 0;
        pass_char();
    }
    else {
        close_block();
    }
    return;
}

# Copies the tag at the front of $textin (up to and including its ">") to
# $textout, starts a new indented output line and swallows one following
# input newline.
sub emit_tag_line {
    my $len = index($textin, ">") + 1;
    if ($len <= 0) {
        # No ">" at all: copy one character so the main loop always advances.
        pass_char();
        return;
    }
    $textout .= substr($textin, 0, $len);
    $textout .= "\n" . $indent;
    $textin = substr($textin, $len);
    skip_one_newline();
    return;
}

# Removes one indentation level and makes sure the output is positioned at
# the start of a line carrying the reduced indentation.
sub start_outdented_line {
    my $step = length($INDENT_STEP);
    $indent = length($indent) > $step ? substr($indent, 0, -$step) : "";

    if ($textout !~ m|\n *$|) {
        $textout .= "\n" . $indent;
    }
    elsif ($textout =~ m|\n +$|) {
        $textout =~ s|\n +$|\n$indent|;
    }
    return;
}

# Puts an opening block tag on its own line and indents what follows.
sub open_block {
    $textout .= "\n" . $indent if $textout !~ m|\n *$|;
    $indent .= $INDENT_STEP;
    emit_tag_line();
    return;
}

# Puts a closing block tag on its own line, one level further out.
sub close_block {
    start_outdented_line();
    emit_tag_line();
    return;
}

# Handles "{" inside a SCRIPT element: keep it on the current line, then
# indent what follows.
sub open_brace {
    $indent .= $INDENT_STEP;
    $textout .= " " if $textout !~ m|[ \n]$|;
    $textout .= substr($textin, 0, 1);
    $textout .= "\n" . $indent;
    $textin = substr($textin, 1);
    skip_one_newline();
    return;
}

# Handles "}" inside a SCRIPT element: outdent, and break the line after it
# unless more text follows on the same input line.
sub close_brace {
    start_outdented_line();
    $textout .= substr($textin, 0, 1);
    $textout .= "\n" . $indent if $textin !~ m|^\} *[^\n]|;
    $textin = substr($textin, 1);
    skip_one_newline();
    return;
}

# Returns the attribute value $value in its normalised quoting:
#   'x'   -> "x"
#   x     -> "x"    (bare integers stay bare unless -xml is set)
#   "123" -> 123    (never with -xml)
# While a SCRIPT tag is open the added quotes are written as \" instead.
sub normalize_value {
    my ($value) = @_;
    my $quote = in_script() ? "\\\"" : "\"";

    if ($value =~ m|^'(.*)'$|) {
        return $quote . $1 . $quote;
    }
    if (   ($xml || $value !~ m|^-?[0-9]+$|)
        && $value !~ m|^\".*\"$|
        && $value !~ m|^\\\".*\\\"$|) {
        return $quote . $value . $quote;
    }
    # The original unquoted numbers even with -xml, which contradicted the
    # rule above that -xml keeps numeric values quoted.
    if (!$xml && $value =~ m|^\"([0-9]+)\"$|) {
        return $1;
    }
    return $value;
}

# Rewrites one complete tag ("<...>"): upper-cases the tag name and the
# attribute names and normalises the quoting of attribute values (see
# normalize_value). With -template, a tag named in @templatetags is returned
# unchanged.
#
# The attribute text is scanned one character at a time. $find is what the
# scanner is waiting for: "=" while reading an attribute name, otherwise the
# delimiter that ends the current value (", ', \" or a space).
sub rewritetag {
    my ($tag) = @_;

    return $tag
        if $template
        && $tag =~ /^(.*?)(<(\/?)($templatetagreg)( ([^>]*?))?(\/?)>)(\n?)(.*)$/si;

    $tag =~ s|^<(.*)>$|$1|is;
    my ($name, $tagmeat) = $tag =~ m|^([^ ]+) ?(.*)$|is;
    # A tag that starts with a space has no parsable name; upper-case it
    # whole, which is what the original ended up doing in that case.
    return "<" . uc($tag) . ">" unless defined $name;

    my $newtag = "<" . uc($name);
    my ($key, $value, $find) = ("", "", "=");

    while ($tagmeat ne "") {
        if (substr($tagmeat, 0, length($find)) eq $find) {
            if ($find eq "=") {
                # Start of a value: work out which delimiter will end it.
                $find = substr($tagmeat, 1, 2) eq "\\\"" ? "\\\""
                      : substr($tagmeat, 1, 1) eq "\""   ? "\""
                      : substr($tagmeat, 1, 1) eq "'"    ? "'"
                      :                                    " ";
                $value   = substr($tagmeat, 1, length($find));
                $tagmeat = substr($tagmeat, length($find));
            }
            elsif ($value !~ m|\\$|) {
                # End of a value (the delimiter is not backslash-escaped).
                $value .= $find if $find ne " ";
                $newtag .= " " . uc($key) . "=" . normalize_value($value);
                ($key, $value) = ("", "");
                $tagmeat = substr($tagmeat, length($find));
                $find    = "=";
                $tagmeat =~ s|^ +||;
                # Sacrificial character for the advance at the loop's end.
                $tagmeat = " " . $tagmeat;
            }
            else {
                $value .= substr($tagmeat, 0, length($find));
            }
        }
        elsif ($find eq "=" && substr($tagmeat, 0, 1) eq " ") {
            # A name that is followed by a space has no value.
            $newtag .= " " . uc($key);
            ($key, $value) = ("", "");
            $tagmeat = substr($tagmeat, 1);
            $tagmeat =~ s|^ +||;
            $tagmeat = " " . $tagmeat;
        }
        elsif ($find eq "=") {
            $key .= substr($tagmeat, 0, 1);
        }
        else {
            $value .= substr($tagmeat, 0, 1);
        }
        $tagmeat = length($tagmeat) ? substr($tagmeat, 1) : "";
    }

    # Whatever was still being read when the tag ended.
    if ($key ne "" && $value ne "") {
        $newtag .= " " . uc($key) . "=" . normalize_value($value);
    }
    elsif ($key ne "" && $find eq "=") {
        $newtag .= " " . uc($key);
    }

    return $newtag . ">";
}

# Returns the index of the first element of @list that equals $val, compared
# case-insensitively, or -1 when there is none.
sub in {
    my ($val, @list) = @_;
    my $wanted = uc($val);
    for my $idx (0 .. $#list) {
        return $idx if uc($list[$idx]) eq $wanted;
    }
    return -1;
}

# Translate stupid word characters: replaces single high-bit bytes (they
# look like Windows-1252 punctuation, as pasted from a word processor) with
# ASCII text or HTML entities, and turns &quot; into a plain double quote.
#
# NOTE(review): in the reviewed copy the entity names had been decoded to the
# characters themselves; &reg; &copy; &deg; and &quot; are restored here.
sub unwordify {
    my ($todecode) = @_;

    my @replacements = (
        [ "\xAE",   "&reg;"  ],
        [ "\xA9",   "&copy;" ],
        [ "\x99",   "&#153;" ],
        [ "\x91",   "'"      ],
        [ "\x92",   "'"      ],
        [ "\xB4",   "'"      ],
        [ "\x93",   "\""     ],
        [ "\x94",   "\""     ],
        [ "&quot;", "\""     ],
        [ "\xBA",   "&deg;"  ],
        [ "\x82",   ","      ],
        [ "\x85",   "..."    ],
        [ "\x84",   ",,"     ],
        [ "\x96",   "--"     ],
        [ "\x97",   "---"    ],
    );

    for my $pair (@replacements) {
        my ($from, $to) = @$pair;
        $todecode =~ s/\Q$from\E/$to/g;
    }
    return $todecode;
}

# Keeps the file loadable with require/do (for example from a test harness).
1;