InfoQube IM - Community

11/25/2010: the Perl script Jibz uses to create the CHM file.

#!/usr/local/bin/perl -w

# Convert dumped printer friendly version of the online drupal user manual
# to a chm compilable state.
#
# Pass filename as arg.
#
# Output is written to the current directory: the HTML Help project file
# (index.hhp), the contents file (index.hhc), the index file (index.hhk),
# a generated start page (index.html) and one node<N>.html per drupal node.

use strict;
use warnings;
use open IO => ":utf8"; # all files are assumed to be utf8

# get filename from command line
# (checked with defined/length so that a file literally named "0" is accepted)
my $filename = shift;
defined $filename && length $filename
    or die "syntax: dump2chm.pl <html file>";

# open html dump file
# Three-argument open keeps mode characters in the user-supplied name from
# being interpreted. The handle stays the bareword FH because the rest of
# the script reads from it under that name.
open FH, '<', $filename or die "couldn't open $filename: $!";

# open files used by the chm compiler
open my $pfile, '>', 'index.hhp'  or die "couldn't open index.hhp: $!";
open my $cfile, '>', 'index.hhc'  or die "couldn't open index.hhc: $!";
open my $kfile, '>', 'index.hhk'  or die "couldn't open index.hhk: $!";
open my $ifile, '>', 'index.html' or die "couldn't open index.html: $!";

# get date: only day of month, month index and year are needed
my ($mday, $mon, $year) = (localtime)[3, 4, 5];
$year += 1900; # localtime returns the number of years since 1900
my @mabbr = qw( Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec );

# number of the node currently being written (-1 until the first node is seen)
my $curnode = -1;

# nesting depth of the most recently started node
my $curnesting = 1;

# this will be the current node file when we open one
my $nfile;

# this hash will be used to store info about internal nodes
# (node number => nesting depth)
my %nodes;

# Write the fixed preamble of every generated file.

# index.hhp: HTML Help project options, window definition and the start of
# the [FILES] list (node files are appended to it later).
print {$pfile} <<'END_HHP';
[OPTIONS]
Auto Index=Yes
Compiled file=InfoQube.chm
Compatibility=1.1
Full-text search=Yes
Contents file=index.hhc
Default Window=main
Default topic=index.html
Index file=index.hhk
Language=0x409 English (United States)
Title=InfoQube User Manual

[WINDOWS]
main="InfoQube Information Manager","index.hhc","index.hhk","index.html","index.html",,,,,0x23520,,0x387e,,,,,,,,0

[FILES]
index.html
END_HHP

# index.hhc and index.hhk start with the same sitemap preamble; the index
# file additionally opens its one flat list right away.
my $sitemap_preamble = <<'END_SITEMAP';
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML><HEAD></HEAD><BODY>
<OBJECT type="text/site properties">
<param name="FrameName" value="right">
</OBJECT>
END_SITEMAP

print {$cfile} $sitemap_preamble;
print {$kfile} $sitemap_preamble . "<UL>\n";

# index.html: the start page, stamped with the generation date.
my $generated_on = "$mabbr[$mon] $mday $year";

print {$ifile} <<"END_INDEX";
<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>
<title>InfoQube User Manual</title></head><body>

<h1>InfoQube User Manual</h1>

<p>This manual has been automagically generated by <tt>dump2chm.pl</tt> on $generated_on.</p>

<p>It is a (rough) conversion of the online <a href="https://infoqubeim.com/drupal5/index.php?q=node/48">InfoQube User Manual</a>
and may contain some bad links, you've been warned.</p>

</body></html>
END_INDEX

# First pass: remember every node number that occurs in the dump, together
# with its nesting depth, so the second pass can tell which links point at
# pages that will exist inside the chm.
# A node starts with a line such as <div id="node-61" class="section-2">
while (defined(my $line = <FH>)) {
    next unless $line =~ /^<div id="?node-(\d+)"? class="?section-(\d+)"?>$/i;
    $nodes{$1} = $2;
}

# rewind the dump for the second pass
seek FH, 0, 0;

# Second pass: loop through the html dump and write to appropriate files.
#
# Every node div starts a new node<N>.html file and a new entry position in
# index.hhc; every book heading supplies the title for the current node; all
# other lines are copied into the current node file after link and character
# fix-ups.
while (defined(my $line = <FH>)) {

    # new drupal node
    # format is <div id="node-61" class="section-2">
    # ie uses capitals and no quotes, hence the optional quotes and ignore case option
    if (my ($node, $nesting) = $line =~ /^<div id="?node-(\d+)"? class="?section-(\d+)"?>$/i) {

        $curnode = $node;

        # finish and close the previous node file, if any
        if ($nfile) {
            print $nfile "</body></html>\n";
            close $nfile or die "couldn't close previous node file: $!";
        }

        # open new file based on node number
        open $nfile, '>', "node$node.html" or die "couldn't open node$node.html: $!";

        # print node header to new node file
        print $nfile qq#<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n#;
        print $nfile qq#<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>\n#;

        # add filename to index.hhp
        print $pfile "node$node.html\n";

        # finish last item in index.hhc setting the icon to book for nodes with children
        if ($nesting > 1) {
            if ($curnesting < $nesting) {
                print $cfile qq#<param name="ImageNumber" value="1"></OBJECT>\n#;
            }
            else {
                print $cfile qq#<param name="ImageNumber" value="11"></OBJECT>\n#;
            }
        }

        # add list start or end tags to index.hhc based on the new nesting depth
        while ($curnesting > $nesting) {
            print $cfile "</UL>\n";
            --$curnesting;
        }

        while ($curnesting < $nesting) {
            print $cfile "<UL>\n";
            ++$curnesting;
        }

        next;
    }

    # new section
    # format is <h1 class="book-heading">3. InfoQube User Interface</h1>
    # note: there can be line breaks in the titles! (e.g. 3.2.10)
    if (my ($title) = $line =~ /^<h1 class="?book-heading"?>(.+)$/i) {

        # a heading needs a node file to go into; fail with a clear message
        # instead of an "undefined value as filehandle" error
        $nfile or die "book heading found before any node div in $filename";

        chomp $title;

        # append next line while we cannot remove the end tag
        until ($title =~ s/<\/h1>$//i) {
            my $more = <FH>;

            # without this check a missing </h1> would loop forever at EOF
            defined $more
                or die "unterminated book heading in node$curnode.html";

            chomp $more;
            $title .= $more;
        }

        # make sure title is printed correctly to current node file
        $line = qq#<h1 class="book-heading">$title</h1>\n#;

        # finish node file header and start body
        print $nfile "<title>$title</title></head><body>\n";

        # add title to index.hhc (the OBJECT is closed when the next node starts)
        print $cfile qq#<LI><OBJECT type="text/sitemap"><param name="Name" value="$title"><param name="Local" value="node$curnode.html">#;

        # remove any section number and add to index.hhk
        (my $index_title = $title) =~ s/^\d+(?:\.\d+)*\.?\s+//;
        print $kfile qq#<LI><OBJECT type="text/sitemap"><param name="Name" value="$index_title"><param name="Local" value="node$curnode.html"></OBJECT>\n#;
    }

    # Fix local urls that got messed up in the dump
    # (/g so that every link on the line is repaired, not only the first)
    $line =~ s#"https://infoqubeim\.com/index\.php\?q=#"https://infoqubeim.com/drupal5/index.php?q=#g;

    # fix internal links
    # format is <a href="https://infoqubeim.com/drupal5/index.php?q=node/864" class="link-node">5.3 Sorting</a>
    # Links to nodes that are part of this dump are pointed at the local
    # node file; links to any other node are left untouched. The closing
    # quote is part of the match so that node/8 never matches node/86.
    $line =~ s{("https://infoqubeim\.com/drupal5/index\.php\?q=node/(\d+)")}
              { exists $nodes{$2} ? qq("node$2.html") : $1 }ge;

    # warn about input tags, since they are sometimes used in place of img tags
    if ($line =~ /<input /i) {
        print "<input> tag warning in node$curnode.html\n";
    }

    # replace every non-ASCII character with a numeric html entity
    $line =~ s/([\x80-\x{10ffff}])/"&#" . ord($1) . ";"/ge;

    # print to node file if any
    print $nfile $line if $nfile;
}

# finish up

# close the last item in index.hhc (items are left open until the next node
# decides their icon; the final one is always a leaf)
print $cfile qq#<param name="ImageNumber" value="11"></OBJECT>\n#;

# close every list level that is still open in index.hhc: one <UL> was
# written for each nesting step above depth 1
while ($curnesting > 1) {
    print $cfile "</UL>\n";
    --$curnesting;
}
print $cfile "</BODY></HTML>\n";

# close the single flat list of index.hhk and the document itself
print $kfile "</UL>\n";
print $kfile "</BODY></HTML>\n";

# the last node file is not followed by another node, so it has to be
# finished here
if ($nfile) {
    print $nfile "</body></html>\n";
    close $nfile or die "couldn't close last node file: $!";
}

# buffered write errors only surface at close, so check the output files
close FH;
close $pfile or die "couldn't close index.hhp: $!";
close $cfile or die "couldn't close index.hhc: $!";
close $kfile or die "couldn't close index.hhk: $!";
close $ifile or die "couldn't close index.html: $!";

Perl script for generating the CHM manual