Source code of Windows XP (NT5)
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

727 lines
16 KiB

  1. #!perl
  2. =head1 NAME
  3. Htmlcln - Sanitizes HTML and related files before publication
  4. =head1 SYNOPSIS
  5. perl htmlcln.pl [-t [html|js|css]] [-DVAR[=value] ...] [-v]
  6. -o I<outfile> I<infile>
  7. =head1 DESCRIPTION
  8. The Htmlcln program preprocesses text files such as HTML, JS, or CSS
  9. files and cleans them up.
  10. =over 8
  11. Comments are removed.
  12. Blank lines are removed.
  13. Sections marked "debug" are removed in the retail build.
  14. =back
  15. =head1 OPTIONS
  16. =over 8
  17. =item B<-t> [html|js|css]
  18. Htmlcln normally tries to guess what kind of file it is processing from
  19. the filename extension. You can explicitly override the guess with the
  20. B<-t> command line switch.
  21. =item B<-D>VAR[=value] ...
  22. Command line definitions are supported in the same manner as the C compiler.
  23. The only command-line variable we pay attention to is the -DDBG flag, which
  24. indicates that this is a debug build.
  25. =item B<-o> I<outfile>
  26. Specifies the name of the output file.
  27. =item I<srcfile>
  28. Specifies the name of the source file.
  29. =back
  30. =cut
  31. use strict qw(vars refs subs);
  32. ##############################################################################
  33. #
  34. # Element - A class that parses HTML elements
  35. #
  36. # Instance variables:
  37. #
  38. # raw = the raw text
  39. # attr = hash of attributes
  40. # tag = name of tag, including slash
  41. #
  42. # If the value of an attribute hash is undef, it means that the
  43. # attribute is present but with no value.
  44. #
  45. package Element;
  46. #
  47. # Constructor: $elem = new Element("<TABLE BORDER>");
  48. #
  49. sub new {
  50. my ($class, $raw) = @_;
  51. my $attr = { };
  52. my $self = { raw => $raw, attr => $attr };
  53. my $tag;
  54. if ($raw =~ s/^<([^\s>]*)//) {
  55. $self->{tag} = uc $1;
  56. if ($self->{tag} =~ /^[A-Z]/) {
  57. $raw =~ s/>$//;
  58. for (;;) {
  59. if ($raw =~ s/^\s*([-A-Za-z]+)="([^"]*)"// ||
  60. $raw =~ s/^\s*([-A-Za-z]+)='([^']*)'// ||
  61. $raw =~ s/^\s*([-A-Za-z]+)=(\S*)//) {
  62. $attr->{uc $1} = $2;
  63. } elsif ($raw =~ s/^\s*([A-Za-z]+)//) {
  64. $attr->{uc $1} = undef;
  65. } else {
  66. last;
  67. }
  68. }
  69. }
  70. } else {
  71. warn "Can't parse \"$raw\"";
  72. }
  73. bless $self, $class;
  74. }
  75. #
  76. # Element::Tag
  77. #
  78. # Returns the tag.
  79. #
  80. sub Tag {
  81. my $self = shift;
  82. $self->{tag};
  83. }
  84. #
  85. # Element::Attr
  86. #
  87. # Returns the value of the attribute.
  88. #
  89. sub Attr {
  90. my ($self, $attr) = @_;
  91. $self->{attr}{uc $attr};
  92. }
  93. #
  94. # Element::Exists
  95. #
  96. # Returns the presence of the attribute.
  97. #
  98. sub Exists {
  99. my ($self, $attr) = @_;
  100. exists $self->{attr}{uc $attr};
  101. }
  102. ##############################################################################
  103. #
  104. # Filter base class
  105. #
  106. # Basic stuff to save people some hassle.
  107. #
  108. # Per perl tradition, an object is a ref to an anonymous hash where the
  109. # state is kept.
  110. #
  111. # Instance variables:
  112. #
  113. # sink = reference to filter sink
  114. #
  115. package Filter;
  116. sub new {
  117. my($class) = @_;
  118. bless { }, $class;
  119. }
  120. sub SetSink {
  121. my ($self, $sink) = @_;
  122. $self->{sink} = $sink;
  123. }
  124. sub Add {
  125. my $self = shift;
  126. $self->{sink}->Add(@_);
  127. }
  128. sub Flush { }
  129. sub Close {
  130. my $self = shift;
  131. $self->{sink}->Close(@_);
  132. }
  133. sub SinkAdd {
  134. my $self = shift;
  135. $self->{sink}->Add(@_);
  136. }
  137. ##############################################################################
  138. #
  139. # TokenFilter filter package
  140. #
  141. # Does not modify the stream, but merely chops them into tokens, as
  142. # recognized by NextToken and processed by EachToken.
  143. #
  144. # Instance data:
  145. #
  146. # buf = unprocessed text
  147. #
  148. package TokenFilter;
  149. @TokenFilter::ISA = qw(Filter);
  150. #
  151. # Append the incoming text to the buffer, then suck out entire tokens.
  152. #
  153. sub Add {
  154. my($self, $text) = @_;
  155. my $tok;
  156. $self->{buf} .= $text;
  157. while ($self->{buf} ne '' && defined($tok = $self->NextToken))
  158. {
  159. $self->EachToken($tok);
  160. }
  161. }
  162. sub Flush {
  163. my $self = shift;
  164. $self->EachToken($self->{buf});
  165. }
  166. #
  167. # By default, we just sink tokens to the next layer.
  168. #
  169. sub EachToken {
  170. my($self, $tok) = @_;
  171. $self->SinkAdd($tok);
  172. }
  173. ##############################################################################
  174. #
  175. # LineFilter filter package
  176. #
  177. # Tokenizer that recognizes lines.
  178. #
  179. # Instance data:
  180. #
  181. # buf = unprocessed text
  182. #
  183. package LineFilter;
  184. @LineFilter::ISA = qw(TokenFilter);
  185. #
  186. # Recognize lines.
  187. #
  188. sub NextToken {
  189. my($self) = shift;
  190. if ($self->{buf} =~ s/([^\n]*\n)//) {
  191. $1;
  192. } else {
  193. undef;
  194. }
  195. }
  196. ##############################################################################
  197. #
  198. # WhitespaceFilter filter package
  199. #
  200. # Removes blank lines and removes leading and trailing whitespace.
  201. #
  202. # Someday: Collapse multiple whitespace outside of quotation marks.
  203. #
  204. package WhitespaceFilter;
  205. @WhitespaceFilter::ISA = qw(LineFilter);
  206. sub EachToken {
  207. my($self, $line) = @_;
  208. $line =~ s/^[ \t]+//;
  209. $line =~ s/[ \t]+$//;
  210. $self->SinkAdd($line) unless $line =~ /^$/;
  211. }
  212. ##############################################################################
  213. #
  214. # OutFile filter package
  215. #
  216. # Writes its output to a file.
  217. #
  218. # Instance data:
  219. #
  220. # fh = name of file handle
  221. #
  222. #
  223. package OutFile;
  224. @OutFile::ISA = qw(Filter);
  225. no strict 'refs'; # Our filename globs aren't very strict
  226. #
  227. # Custom method: SetOutput. Opens an output file.
  228. #
  229. my $seq = 0;
  230. sub SetOutput {
  231. my($self, $file) = @_;
  232. $self->{fh} = "OutFile" . $seq++;
  233. open($self->{fh}, ">$file") || die "Unable to open $file for writing ($!)\n";
  234. }
  235. sub Add {
  236. my $self = shift;
  237. print { $self->{fh} } @_;
  238. }
  239. sub Close {
  240. my $self = shift;
  241. close($self->{fh});
  242. }
  243. ##############################################################################
  244. #
  245. # DebugFilter filter package
  246. #
  247. # Filters out ;debug and ;begin_debug blocks if building retail.
  248. #
  249. # Instance data:
  250. #
  251. # skip = nonzero if we are inside an ignored ;begin_debug block
  252. # buf = unprocessed text
  253. #
  254. package DebugFilter;
  255. @DebugFilter::ISA = qw(LineFilter);
  256. no strict 'refs'; # Our filename globs aren't very strict
  257. #
  258. # See if the line contains a debug marker.
  259. # If applicable, send the line down the chain.
  260. #
  261. sub EachToken {
  262. my($self, $line) = @_;
  263. # ;begin_debug means start skipping if retail
  264. if ($line =~ s/;begin_debug//) {
  265. $self->{skip} = $::RetailVersion;
  266. }
  267. # If we were skipping, then ;end_debug ends skipping and we should eat it
  268. if ($line =~ s/;end_debug// && $self->{skip}) {
  269. $self->{skip} = 0;
  270. } elsif ($line =~ s/;debug// && $::RetailVersion) {
  271. # A one-shot debug line in retail - skip it
  272. } elsif (!$self->{skip}) {
  273. $self->SinkAdd($line); # send it down the chain
  274. }
  275. }
  276. ##############################################################################
  277. #
  278. # CPP filter package
  279. #
  280. # The CPP filter performs the following operations:
  281. #
  282. # Removes C and C++-style comments.
  283. #
  284. # Filters whitespace.
  285. #
  286. # Instance data:
  287. #
  288. # buf = unprocessed text
  289. # wsf = child WhitespaceFilter
  290. # script = current script sink
  291. # ultSink = the ultimate sink
  292. package CPP;
  293. @CPP::ISA = qw(TokenFilter);
  294. sub new {
  295. my($class) = shift;
  296. my $self = new Filter;
  297. $self->{wsf} = new WhitespaceFilter; # sink into a whitespace filter
  298. $self->{sink} = $self->{wsf}; # initially use this script
  299. bless $self, $class;
  300. }
  301. #
  302. # Recognize tokens, which are lines or /* ... */ comments.
  303. #
  304. sub NextToken {
  305. my($self) = shift;
  306. if ($self->{buf} =~ s/^([^\/]+)//) { # eat up to a slash
  307. $1;
  308. } elsif ($self->{buf} =~ s/^\/\/.*?\n//) { # eat // to end of line
  309. "\n";
  310. } elsif ($self->{buf} =~ s/^\/\*[^\0]*?\*\///) { # eat /* .. */
  311. '';
  312. } elsif ($self->{buf} =~ s/^(\/)(?=[^\/\*])//) { # eat / not followed by / or *
  313. $1;
  314. } else { # incomplete fragment - stop
  315. undef;
  316. }
  317. }
  318. #
  319. # SetSink
  320. #
  321. # The sink we get is really the whitespace filter's sink, and we sink
  322. # into the whitespace filter.
  323. #
  324. sub SetSink {
  325. my ($self, $sink) = @_;
  326. $self->{wsf}->SetSink($sink);
  327. }
  328. ##############################################################################
  329. #
  330. #
  331. # JS - comments are // or /* ... */, invoked via <SCRIPT>...
  332. # CSS - comments are /* ... */, invoked via <STYLE TYPE="text/css">
  333. #
  334. # They are both just CPP thingies. Both should someday remove whitespace
  335. package JS;
  336. @JS::ISA = qw(CPP);
  337. package CSS;
  338. @CSS::ISA = qw(CPP);
  339. ##############################################################################
  340. #
  341. # HTML filter package
  342. #
  343. # The HTML filter performs the following operations:
  344. #
  345. # Send the final output through a whitespace filter.
  346. #
  347. # Remove comments.
  348. #
  349. # Someday it will also...
  350. #
  351. # Recognize embedded stylesheets and scripts and generate a subfilter
  352. # to handle them.
  353. #
  354. # Compress spaces outside quotation marks.
  355. #
  356. # Instance data:
  357. #
  358. # buf = unprocessed text
  359. # wsf = child WhitespaceFilter
  360. # script = current script sink
  361. # endScript = sub that recognizes end of script
  362. # ultSink = the ultimate sink
  363. package HTML;
  364. @HTML::ISA = qw(TokenFilter);
  365. sub new {
  366. my($class) = shift;
  367. my $self = new Filter;
  368. $self->{wsf} = new WhitespaceFilter;
  369. $self->{sink} = $self->{wsf}; # initially use this script
  370. bless $self, $class;
  371. }
  372. #
  373. # SetSink
  374. #
  375. # The sink we get is really the whitespace filter's sink, and we sink
  376. # into the whitespace filter.
  377. #
  378. sub SetSink {
  379. my ($self, $sink) = @_;
  380. $self->{ultSink} = $sink;
  381. $self->{wsf}->SetSink($sink);
  382. }
  383. #
  384. # NextHTMLToken
  385. #
  386. # An HTML token is one of the following:
  387. #
  388. # - A hunk of boring text.
  389. # - A comment (thrown away).
  390. # - A matched <...> thingie.
  391. sub NextHTMLToken {
  392. my($self) = shift;
  393. #
  394. # Any string of non "<" counts as a boring text token.
  395. #
  396. # Be careful not to mistake <!DOCTYPE...> as a comment.
  397. #
  398. if ($self->{buf} =~ s/^([^<]+)//) {
  399. $1;
  400. } elsif ($self->{buf} =~ s/^(<!--[^\0]*?-->)//) { # Eat full comments
  401. '';
  402. } elsif ($self->{buf} =~ s/^(<![^-][^>]*>)//) { # <!DOCTYPE ...>
  403. $1;
  404. } elsif ($self->{buf} =~ s/^(<[^!][^>]*>)//) { # <something else>
  405. $1;
  406. } else { # incomplete fragment - stop
  407. undef;
  408. }
  409. }
  410. #
  411. # NextScriptToken
  412. #
  413. # A script token is anything that isn't the word </SCRIPT>.
  414. #
  415. sub NextScriptToken
  416. {
  417. my($self) = shift;
  418. if ($self->{buf} =~ s,^(</SCRIPT>),,i) {
  419. $1;
  420. } elsif ($self->{buf} =~ s,^(.*?)</SCRIPT>,,i) {
  421. $1;
  422. } else {
  423. my $tok = $self->{buf};
  424. $self->{buf} = '';
  425. $tok;
  426. }
  427. }
  428. #
  429. # NextToken
  430. #
  431. # Returns either an HTML token or a script token.
  432. #
  433. sub NextToken {
  434. my($self) = shift;
  435. if (defined $self->{script}) {
  436. $self->NextScriptToken();
  437. } else {
  438. $self->NextHTMLToken();
  439. }
  440. }
  441. #
  442. # _Redirect - Private method that redirects parsing to a script language.
  443. #
  444. # $self->_Redirect($scr, $end);
  445. #
  446. # $scr = script object to hook in
  447. # $end = sub that recognizes the end of the script
  448. #
  449. #
  450. sub _Redirect {
  451. my ($self, $scr, $end) = @_;
  452. $self->{script} = $self->{sink} = $scr;
  453. $scr->SetSink($self->{ultSink});
  454. $self->{endScript} = $end;
  455. }
  456. sub EachToken {
  457. my($self, $tok) = @_;
  458. if ($tok =~ /^<SCRIPT/i) {
  459. $self->{inScript} = 1; # BUGBUG create a script sink
  460. my $elem = new Element($tok);
  461. my $lang = lc $elem->Attr("LANGUAGE");
  462. my $scr;
  463. # No language implies JScript
  464. if (!defined($lang) || $lang eq 'jscript' || $lang eq 'javascript') {
  465. $scr = new CPP;
  466. } else {
  467. warn "Unknown script language [$lang]";
  468. # Just use the whitespace filter as the unknown script filter
  469. $scr = new WhitespaceFilter;
  470. }
  471. $self->_Redirect($scr, sub { m,^</SCRIPT>,i });
  472. } elsif ($tok =~ /<STYLE/i) {
  473. $self->_Redirect(new CSS, sub { m,^</STYLE>,i });
  474. } elsif (defined($self->{endScript}) && &{$self->{endScript}}($tok)) {
  475. delete $self->{endScript};
  476. $self->{script}->Flush();
  477. delete $self->{script};
  478. $self->{sink} = $self->{wsf};
  479. }
  480. $self->SinkAdd($tok);
  481. }
  482. ##############################################################################
  483. #
  484. # Main package
  485. #
  486. package main;
  487. #
  488. # Set up some defaults.
  489. #
  490. my $force_type = undef; # do not force file type
  491. $::RetailVersion = 1; # not the debugging version
  492. my $outfile = undef; # output file not known yet
  493. my %VAR = (); # No variables defined yet
  494. my $verbose = undef; # not verbose mode
  495. ##############################################################################
  496. #
  497. # CreateTypeFilter - Create a filter for the specified type.
  498. #
  499. my $types = {
  500. html => sub { new HTML }, # HTML
  501. htm => sub { new HTML },
  502. htx => sub { new HTML },
  503. js => sub { new JS }, # Javascript
  504. jsx => sub { new JS },
  505. css => sub { new CSS }, # Cascading style sheet
  506. csx => sub { new CSS },
  507. };
  508. sub CreateTypeFilter {
  509. my $sub = $types->{lc shift};
  510. &$sub;
  511. }
  512. ##############################################################################
  513. #
  514. # Command line parsing
  515. #
  516. sub Usage {
  517. die "Usage: htmlcln [-t [html|js|css]] [-DVAR[=value]...] [-v] -o outfile infile\n";
  518. }
  519. #
  520. # AddDefine - Handle a -D command line option.
  521. #
  522. sub AddDefine {
  523. my $line = shift;
  524. if ($line =~ /=/) {
  525. $VAR{$`} = $';
  526. } else {
  527. $VAR{$line} = 1;
  528. }
  529. }
  530. sub ParseCommandLine {
  531. #
  532. # Scream through the command line arguments.
  533. #
  534. while ($#ARGV >= 0 && $ARGV[0] =~ /^-(.)(.*)/) {
  535. # $1 - command
  536. # $2 - optional argument
  537. my($cmd, $val) = ($1, $2);
  538. shift(@ARGV);
  539. if ($cmd eq 't') {
  540. $val = shift(@ARGV) if $val eq '';
  541. $force_type = $val;
  542. } elsif ($cmd eq 'D') {
  543. AddDefine($val);
  544. } elsif ($cmd eq 'o') {
  545. $val = shift(@ARGV) if $val eq '';
  546. $outfile = $val;
  547. } elsif ($cmd eq 'v') {
  548. $verbose = 1;
  549. } else {
  550. Usage();
  551. }
  552. }
  553. #
  554. # What's left should be a filename, and there should be an output file.
  555. #
  556. my $infile = shift(@ARGV);
  557. Usage() unless defined $infile && defined $outfile && $#ARGV == -1;
  558. #
  559. # If the filetype is not being overridden, then take it from the filename.
  560. #
  561. if (!defined $force_type) {
  562. ($force_type) = $infile =~ /\.(.*)/;
  563. }
  564. #
  565. # Include debug goo only if building DBG=1 and FULL_DEBUG is set in the
  566. # environment.
  567. #
  568. $::RetailVersion = 0 if defined($VAR{"DBG"}) && defined($ENV{"FULL_DEBUG"});
  569. $infile;
  570. }
  571. ##############################################################################
  572. #
  573. # File processing
  574. #
  575. sub ProcessFile {
  576. my $infile = shift;
  577. #
  578. # Create the final sink.
  579. #
  580. my $sink = new OutFile;
  581. $sink->SetOutput($outfile);
  582. #
  583. # Set up the default filter based on the file type.
  584. #
  585. my $Type = CreateTypeFilter($force_type);
  586. $Type->SetSink($sink);
  587. #
  588. # Create the DebugFilter which sits at the top of the chain.
  589. #
  590. my $Filter = new DebugFilter;
  591. $Filter->SetSink($Type);
  592. #
  593. # All the plumbing is ready - start pumping data.
  594. #
  595. open(I, $infile) || die "Cannot open $infile for reading ($!)\n";
  596. while (<I>) {
  597. $Filter->Add($_);
  598. }
  599. $Filter->Flush();
  600. $Filter->Close();
  601. }
  602. ##############################################################################
  603. #
  604. # Main program
  605. #
  606. {
  607. my $infile = ParseCommandLine();
  608. ProcessFile($infile);
  609. }