#!/usr/bin/perl -T
# See http://www.htmlhelp.com/tools/validator/offline/
#####################################################################
#
# Offline HTMLHelp.com Validator
# by Liam Quinn
#
# This is a simplified version of the online WDG HTML Validator
# found at <http://www.htmlhelp.com/tools/validator/>.
#
# Copyright (c) 1998-2010 by Liam Quinn
# This program is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# Contributors:
# * Ville Skytta
# * John Goebel
#
#####################################################################
#####################################################################
# Required libraries #
######################
# These are all standard Perl modules; we'll check for URI and LWP
# later on demand.
use strict;
use Getopt::Long qw(GetOptions);
use Text::Wrap qw(wrap);
use POSIX qw(:fcntl_h);
# Prefer File::Spec::Functions::catfile when that module can be loaded;
# otherwise quietly install a minimal stand-in that joins the path
# components with '/'.
unless (eval {
    require File::Spec::Functions;
    File::Spec::Functions->import('catfile');
    1;
}) {
    *catfile = sub { join('/', @_) };
}
#####################################################################
#####################################################################
# Variables to define #
#######################
# Program identity: version number, the banner printed for --version,
# and the one-line usage summary
my $VERSION = '1.2.3';
my $progname = "Offline HTMLHelp.com Validator, Version $VERSION\nby Liam Quinn ";
my $usage = 'Usage: validate [OPTION] [FILE...]';
# Directory holding the SGML library (catalog, DTDs, SGML declarations)
my $sgmlDir = '/usr/local/share/wdg/sgml-lib';
# Path of the lq-nsgmls executable
my $nsgmlsLocation = '/usr/bin/nsgmls';
# Base lq-nsgmls command line; the SGML declaration and the HTML
# document's filename are appended to this string
my $nsgmls = join(' ', $nsgmlsLocation, '-E0', '-s');
# Extra lq-nsgmls switches used when warnings are requested
my $nsgmlsWarnings = join(' ', '-wnon-sgml-char-ref', '-wmin-tag');
my $nsgmlsXMLWarnings = '-wxml';
# lq-nsgmls "errors" that are only reported when warnings are requested.
# They are genuine errors in XML validation, but are treated as mere
# warnings otherwise.
my %errorAsWarning = map { $_ => 1 } (
    ' net-enabling start-tag not supported in {{XML}}',
    ' unclosed start-tag',
    ' unclosed end-tag',
);
# Catalog files inside the SGML directory: the first serves HTML/SGML
# documents, the second XHTML/XML documents
my ($htmlCatalog, $xhtmlCatalog) = (
    catfile($sgmlDir, 'catalog'),
    catfile($sgmlDir, 'xhtml.soc'),
);
# Handle that receives error output (typically *STDOUT or *STDERR)
my $errout = *STDOUT;
# Versions of HTML associated with a given FPI.  The keys of the finished
# hash have the form  PUBLIC "<formal public identifier>" ; the table below
# lists the bare identifiers and the wrapper is added in one place.
my %HTMLversion = do {
    my @versionForFpi = (
        '-//WAPFORUM//DTD XHTML Mobile 1.2//EN' => 'XHTML-MP 1.2',
        '-//WAPFORUM//DTD XHTML Mobile 1.1//EN' => 'XHTML-MP 1.1',
        '-//WAPFORUM//DTD XHTML Mobile 1.0//EN' => 'XHTML-MP 1.0',
        '-//W3C//DTD XHTML+RDFa 1.0//EN' => 'XHTML+RDFa 1.0',
        '-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN' => 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1',
        '-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN' => 'XHTML 1.1 plus MathML 2.0',
        '-//W3C//DTD MathML 2.0//EN' => 'MathML 2.0',
        '-//W3C//DTD XHTML 1.1//EN' => 'XHTML 1.1',
        '-//WAPFORUM//DTD WML 1.3//EN' => 'WML 1.3',
        '-//WAPFORUM//DTD WML 1.2//EN' => 'WML 1.2',
        '-//WAPFORUM//DTD WML 1.1//EN' => 'WML 1.1',
        '-//WAPFORUM//DTD WML 1.0//EN' => 'WML 1.0',
        '-//W3C//DTD XHTML Basic 1.0//EN' => 'XHTML Basic',
        'ISO/IEC 15445:2000//DTD HyperText Markup Language//EN' => 'ISO/IEC 15445:2000',
        'ISO/IEC 15445:2000//DTD HTML//EN' => 'ISO/IEC 15445:2000',
        '-//W3C//DTD XHTML 1.0 Strict//EN' => 'XHTML 1.0 Strict',
        '-//W3C//DTD XHTML 1.0 Transitional//EN' => 'XHTML 1.0 Transitional',
        '-//W3C//DTD XHTML 1.0 Frameset//EN' => 'XHTML 1.0 Frameset',
        '-//W3C//DTD HTML 4.01//EN' => 'HTML 4.01 Strict',
        '-//W3C//DTD HTML 4.01 Transitional//EN' => 'HTML 4.01 Transitional',
        '-//W3C//DTD HTML 4.01 Frameset//EN' => 'HTML 4.01 Frameset',
        '-//W3C//DTD HTML 4.0//EN' => 'HTML 4.0 Strict',
        '-//W3C//DTD HTML 4.0 Transitional//EN' => 'HTML 4.0 Transitional',
        '-//W3C//DTD HTML 4.0 Frameset//EN' => 'HTML 4.0 Frameset',
        '-//W3C//DTD HTML 3.2 Final//EN' => 'HTML 3.2',
        '-//W3C//DTD HTML 3.2 Draft//EN' => 'HTML 3.2',
        '-//W3C//DTD HTML 3.2//EN' => 'HTML 3.2',
        '-//W3C//DTD HTML Experimental 970421//EN' => 'HTML 3.2 + Style',
        '-//W3O//DTD W3 HTML 3.0//EN' => 'HTML 3.0 Draft',
        '-//IETF//DTD HTML 3.0//EN//' => 'HTML 3.0 Draft',
        '-//IETF//DTD HTML 3.0//EN' => 'HTML 3.0 Draft',
        '-//IETF//DTD HTML i18n//EN' => 'HTML 2.0 + i18n',
        '-//IETF//DTD HTML//EN' => 'HTML 2.0',
        '-//IETF//DTD HTML 2.0//EN' => 'HTML 2.0',
        '-//IETF//DTD HTML Level 2//EN' => 'HTML 2.0',
        '-//IETF//DTD HTML 2.0 Level 2//EN' => 'HTML 2.0',
        '-//IETF//DTD HTML Level 1//EN' => 'HTML 2.0 Level 1',
        '-//IETF//DTD HTML 2.0 Level 1//EN' => 'HTML 2.0 Level 1',
        '-//IETF//DTD HTML Strict//EN' => 'HTML 2.0 Strict',
        '-//IETF//DTD HTML 2.0 Strict//EN' => 'HTML 2.0 Strict',
        '-//IETF//DTD HTML Strict Level 2//EN' => 'HTML 2.0 Strict',
        '-//IETF//DTD HTML 2.0 Strict Level 2//EN' => 'HTML 2.0 Strict',
        '-//IETF//DTD HTML Strict Level 1//EN' => 'HTML 2.0 Strict Level 1',
        '-//IETF//DTD HTML 2.0 Strict Level 1//EN' => 'HTML 2.0 Strict Level 1',
    );
    my %lookup;
    # Consume the table two items at a time: identifier, then version name
    while (my ($fpi, $version) = splice(@versionForFpi, 0, 2)) {
        $lookup{qq{PUBLIC "$fpi"}} = $version;
    }
    %lookup;
};
# SGML declarations for a given level of HTML.  Each row names one
# declaration file (path components below the SGML directory) followed by
# every level that is validated with it.
my %sgmlDecl;
{
    my @levelsForDeclaration = (
        [ [ 'xhtml-basic10', 'xml1.dcl' ],
          'XHTML-MP 1.2', 'XHTML-MP 1.1', 'XHTML-MP 1.0', 'XHTML Basic' ],
        [ [ 'xhtml11', 'xml1n.dcl' ],
          'XHTML+RDFa 1.0', 'XHTML 1.1 plus MathML 2.0 plus SVG 1.1',
          'XHTML 1.1 plus MathML 2.0', 'MathML 2.0', 'XHTML 1.1' ],
        [ [ 'xhtml1', 'xhtml1.dcl' ],
          'WML 1.3', 'WML 1.2', 'WML 1.1', 'WML 1.0',
          'XHTML 1.0 Strict', 'XHTML 1.0 Transitional', 'XHTML 1.0 Frameset',
          # For generic XML validation (using the --xml option)
          'XML' ],
        [ [ '15445.dcl' ], 'ISO/IEC 15445:2000' ],
        [ [ 'HTML4.dcl' ],
          'HTML 4.01 Strict', 'HTML 4.01 Transitional', 'HTML 4.01 Frameset',
          'HTML 4.0 Strict', 'HTML 4.0 Transitional', 'HTML 4.0 Frameset' ],
        [ [ 'HTML32.dcl' ], 'HTML 3.2' ],
        [ [ 'html-970421.decl' ], 'HTML 3.2 + Style' ],
        [ [ 'HTML3.dcl' ], 'HTML 3.0 Draft' ],
        [ [ 'i18n.dcl' ], 'HTML 2.0 + i18n' ],
        [ [ 'html.dcl' ],
          'HTML 2.0', 'HTML 2.0 Strict', 'HTML 2.0 Level 1',
          'HTML 2.0 Strict Level 1' ],
        [ [ 'custom.dcl' ], 'Unknown' ],
    );
    foreach my $row (@levelsForDeclaration) {
        my ($pathParts, @levels) = @$row;
        my $declaration = catfile($sgmlDir, @$pathParts);
        foreach my $level (@levels) {
            $sgmlDecl{$level} = $declaration;
        }
    }
}
# XHTML DTDs: a set of the level names that are flagged as XHTML/XML
# (every member maps to 1)
my %xhtml = map { $_ => 1 } (
    'XHTML-MP 1.2',
    'XHTML-MP 1.1',
    'XHTML-MP 1.0',
    'XHTML+RDFa 1.0',
    'XHTML 1.1 plus MathML 2.0 plus SVG 1.1',
    'XHTML 1.1 plus MathML 2.0',
    'MathML 2.0',
    'XHTML 1.1',
    'WML 1.3',
    'WML 1.2',
    'WML 1.1',
    'WML 1.0',
    'XHTML Basic',
    'XHTML 1.0 Strict',
    'XHTML 1.0 Transitional',
    'XHTML 1.0 Frameset',
    'XML',
);
# Default DOCTYPE if the document is missing a DOCTYPE.  It must not be
# empty: the accompanying message below promises HTML 4.01 Transitional,
# and the public identifier used here is the one %HTMLversion maps to
# 'HTML 4.01 Transitional'.
my $defaultDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">';
# Default DOCTYPE if the document contains frames; the public identifier
# is the one %HTMLversion maps to 'HTML 4.01 Frameset'.
my $defaultFramesetDoctype = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Frameset//EN">';
# Error for missing DOCTYPE
my $noDoctype = "missing document type declaration; assuming HTML 4.01 Transitional";
# Error for missing DOCTYPE in a Frameset document
my $noFramesetDoctype = "missing document type declaration; assuming HTML 4.01 Frameset";
#####################################################################
#####################################################################
#
# The rest of the script...
#
#####################################################################
# Get rid of unsafe environment variables, see perlsec
delete(@ENV{qw(PATH IFS CDPATH ENV BASH_ENV)});
# Flush output buffer
$| = 1;
### Get user input ###
# Character encoding to use (optional)
my $charsetOverride;
# Verbose output (optional)
my $verbose;
# Emacs-friendly output; defaults to on when the EMACS environment
# variable is 't', and can be forced with --emacs / --noemacs
my $emacs = ($ENV{EMACS} && $ENV{EMACS} eq 't');
# XML mode
my $xml;
# Whether warnings are desired
my $warnings;
# HTTP headers to send (--header may be given several times)
my @headers;
# Help and version info
my $help;
my $versionInfo;
# Parse the command line.  When GetOptions() returns false it has already
# printed a diagnostic for each bad option, so show the usage summary and
# stop rather than validating with a command line that was only partly
# understood.
GetOptions(
    "xml"       => \$xml,
    "charset=s" => \$charsetOverride,
    "verbose"   => \$verbose,
    "help|h"    => \$help,
    "version|v" => \$versionInfo,
    "emacs!"    => \$emacs,
    "warn|w|W"  => \$warnings,
    "header=s"  => \@headers,
) or do {
    print STDERR "$usage\n";
    exit 2;
};
# Files to validate: whatever GetOptions left behind in @ARGV
my @files = @ARGV;
######################
# Error counter; its current value is the exit status of the early exits
# in this section
my $errors = 0;
# --version and --help print their text and stop without validating
if ($versionInfo || $help) {
    print "$progname\n" if $versionInfo;
    &helpText if $help;
    exit $errors;
}
# With no file arguments, validate standard input (denoted by '-')
@files = ('-') unless @files;
# Check that nsgmls is available before we get too far
if (! -e $nsgmlsLocation) {
    &error("$nsgmlsLocation is not installed");
    exit $errors;
}
# '_' reuses the stat information gathered by the -e test above
if (! -x _) {
    &error("$nsgmlsLocation is not executable");
    exit $errors;
}
# URL arguments can only be handled when both URI and LWP::UserAgent load
eval {
    require URI;
    require LWP::UserAgent;
};
my $uri_ok = $@ ? '' : 1;
# User agent object; left undefined here
my $ua;
# Loop variable for the files to validate
my $file;
foreach $file (@files) {
my $tempname = undef;
my $tempfh = undef;
my $charset = $charsetOverride;
# Read in document
my $document = "";
my $fileIsURL = 0;
if ($file ne '-') {
if ($uri_ok && $file =~ m|^\w+://.+|i) {
$fileIsURL = 1;
unless ($ua) {
$ua = LWP::UserAgent->new(env_proxy => 1, keep_alive => 1);
foreach (@headers) {
if (/^([^\s:]+)\s*:\s*(.*)/) {
if (lc($1) eq 'user-agent') {
$ua->agent($2);
} else {
$ua->default_header($1 => $2);
}
}
}
}
my $uri = URI->new($file);
unless ($ua->is_protocol_supported($uri)) {
&error('Unsupported protocol: ' . $uri->scheme());
next;
}
my $res = $ua->get($uri->canonical());
if ($res->is_success()) {
$document = $res->content();
unless ($charset) {
my $contentType = $res->header('Content-Type');
if ($contentType && $contentType =~ /[\s;]charset\s*=\s*"?([^,"\s]+)/io) {
$charset = $1;
}
}
} else {
&error($res->status_line());
next;
}
} else {
unless (-e $file) {
&error("File $file does not exist.");
next;
}
unless (-r _) {
&error("File $file is not readable.");
next;
}
# Append the contents of the local file to $document.  The three-argument
# open with an explicit '<' mode stops characters in the file name
# (leading/trailing whitespace, '<', '>', '|') from being interpreted as
# an open mode, and the lexical handle is private to this read.
open(my $in, '<', $file) or die "Unexpected error reading $file: $!\n";
# Read line by line from the handle until end of file
while (my $line = <$in>) {
    $document .= $line;
}
close($in);
}
} else {
while (<>) {
$document .= $_;
}
}
# No charset from the command line or the HTTP headers: look inside the
# document itself.
unless ($charset) {
    # Check for a META element specifying the character encoding.  The
    # first pattern captures the attribute text of a <meta ...> tag whose
    # http-equiv attribute is Content-Type (case-insensitive, and /s lets
    # the tag span several lines).
    if ($document =~ m#<meta([^>]*http\-equiv\s*=\s*["']?Content\-Type["']?[^>]*)>#iso) {
        my $metaAttributes = $1;
        # Pull the charset parameter out of the content attribute; the
        # value ends at the next quote character
        if ($metaAttributes =~ m#\scontent\s*=\s*["']?.*[\s;]charset\s*=\s*['"]?([^"']+)#iso) {
            $charset = $1;
        }
    }
}
my @errors; # queue of errors
my @externalErrors; # queue of errors in an external DTD
my $lineAdjust = 0; # account for line number changes if we add a DOCTYPE
# Determine the level of HTML
my $htmlLevel;
my $fileToValidate = $file;
if ($xml) {
$htmlLevel = 'XML';
} else {
$htmlLevel = 'Unknown';
}
if ($document =~ /]*)>/iso) {
my $doctypeMeat = $1;
if ($doctypeMeat =~ /PUBLIC\s+["']([^"']*)["']/iso) {
$htmlLevel = $HTMLversion{"PUBLIC \"$1\""} || $htmlLevel;
}
if ($fileIsURL || $file eq '-') {
($tempname, $tempfh) = getTempFile();
print $tempfh "$document";
close($tempfh);
$fileToValidate = $tempname;
}
} else { # Missing DOCTYPE
# Add a default DOCTYPE
my ($insertedDoctype, $doctypeError);
if ($document =~ /