>Date: Sun, 24 Dec 1995 13:54:30 -0500
>From: selene@niagara.com (Rowan Shirkie)
>Subject: E-mail to HTML
>
>Does anyone have any experience with the E-mail to HTML software that they
>could share?
>
>I want to create an HTML archive of my digests, and point WWW
>browser-surfers and new mailing list subscribers to it. I would use the WWW
>site to publish the mailing list "info" file and list policies as well,
>making it a sort of self-serve request processor.
>
>That way, I figure people with only a casual interest can come and go
>without churning subscriptions. New subscribers can avoid revisiting topics
>that have outlived their interest lifespan on the list itself.
>
>I have seen Hypermail. Does any one have experience with it? Or any other
>converter?
I have a home-grown digest-to-html program that uses perl 5.0. I have an
older one that uses perl 4.0, but it requires a more rigid digest format
(pretty much the one my majordomo digest program produces).
The reason I wrote my own was that the others, like Hypermail, only worked
on mail files, not digests. And all my archives are digests, so I was
stuck. Hypermail is particularly bad for that because it only parses
mailbox "From " style lines, not Arpa-style "From: " lines (one has a colon,
one doesn't; more importantly, digests only have the latter).
The other reason was I like perl and html :-)
Note that my script needs a little tweaking for the "end of digest" message
(look for "End of Volume"), and for the digest name / date / volume number info
taken from the title. Look around line 137 of the script (where $title,
$volume and $date are parsed) for that.
To see a real-life example of the output, check out:
http://reality.sgi.com/employees/pdc/bass/v02q4/topics.html
http://reality.sgi.com/employees/pdc/bass/v02q4/contents.html
Note that subjects are linked together, so from any subject you can jump to
the next or previous subject.
#! /bin/sh
# This is a shell archive. Remove anything before this line, then feed it
# into a shell via "sh file" or similar. To overwrite existing files,
# type "sh file -c".
# The tool that generated this appeared in the comp.sources.unix newsgroup;
# send mail to comp-sources-unix@uunet.uu.net if you want that tool.
# If this archive is complete, you will see the following message at the end:
# "End of shell archive."
# Contents: digest_to_html.perl
# Wrapped by pdc@lunch.engr.sgi.com on Thu Jan 4 17:24:47 1996
# Use a fixed, minimal search path for the utilities invoked below.
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
# Refuse to overwrite an existing copy unless the first argument is "-c".
if test -f 'digest_to_html.perl' -a "${1}" != "-c" ; then
echo shar: Will not clobber existing file \"'digest_to_html.perl'\"
else
echo shar: Extracting \"'digest_to_html.perl'\" \(7960 characters\)
# Everything up to the END_OF_FILE line is written to the script; sed strips
# the leading "X" guard character from each line on the way out.
sed "s/^X//" >'digest_to_html.perl' <<'END_OF_FILE'
X#!/usr/bin/perl5 -w
X#
X# Convert mailing-list digest files into one HTML page per digest, plus a
X# by-date index (contents.html) and a by-subject index (topics.html).
X#
X# usage: digest_to_html.perl [-d html_dir] file [file...]
X#
X
X# Pattern text matching the end of the digest preamble: the "Topics:" list
X# followed by a rule of dashes.  Everything after it is the message body.
X$header_sep = "\n+Topics:[^\000]*?\n--------------------*\n+";
X
X# Pattern text matching the rule of dashes between messages.  The lookahead
X# only accepts a rule that is followed by a mail header line.
X$message_sep = "\n+--------------------*\n+(?=From:|Subject:|Date:)";
X
Xundef %subjcount;       # avoid "used only once" warnings
X
X$dir = ".";             # output directory; -d overrides it
X
X# Consume leading options.  @ARGV is checked first so that running with no
X# arguments does not test an undefined $ARGV[0], and -d must be followed by
X# a directory name.
Xwhile (@ARGV && $ARGV[0] =~ /^-/) {
X    my $opt = shift;
X    if ($opt eq "-d" && @ARGV) {
X        $dir = shift;
X    }
X    else {
X        die "usage $0 [-d html_dir] file [file...]\n";
X    }
X}
X
X# Without input files there is nothing to convert; stop before the index
X# files below are truncated.
Xdie "usage $0 [-d html_dir] file [file...]\n" unless @ARGV;
X
Xundef($/);              # slurp mode: one <FH> read returns the whole file
X
X# The multi-line switch "$* = 1" is deliberately not set: assigning to it is
X# a fatal error in current perls, and every pattern in this script that
X# anchors inside a multi-line string already carries the /m modifier.
X
Xopen(CONTENTS, ">$dir/contents.html") ||
X    die "Can't open $dir/contents.html: $!\n";
Xopen(TOPICS, ">$dir/topics.html") ||
X    die "Can't open $dir/topics.html: $!\n";
X
X#
X# Pass 1 -- Read each file and get a list of subjects.  This is only so
X#           we can be nice and offer to jump to the next article with
X#           the same subject.
X#
X# Fills in:
X#   @okayfiles        input files that could be opened (pass 2 reads these)
X#   $subjrefs{$key}   "\001"-terminated list of "file.html#n" anchors, in
X#                     the order the messages with that subject appear
X#   $realsubj{$key}   first HTML-escaped spelling seen for that subject
X#
X
Xprint "Pass 1...\n";
X
Xforeach $file (@ARGV) {
X    next if $file =~ /\.html$/;         # skip our own output files
X
X    # warn() and next must be separate statements: written as
X    # 'warn "...", next' the next runs while warn's argument list is
X    # being built, so the message is never printed.
X    if (!open(DIGEST, "<$file")) {
X        warn "Can't open $file: $!\n";
X        next;
X    }
X    print "$file\n";
X    $_ = <DIGEST>;                      # read the whole file
X    close(DIGEST);
X
X    push(@okayfiles, $file);
X    $file =~ s!.*/!!;                   # keep only the base name
X
X    # split off digest header
X    if (/$header_sep/o) {
X        $body = $';
X    }
X    else {
X        print "$file: header split didn't work: \$header_sep probably not right for this digest!\n";
X        $body = $_;
X    }
X
X    # "!~" is required here: "! $body =~ /re/" negates $body first and
X    # then matches the resulting flag, which can never succeed.
X    if ($body !~ /$message_sep/o) {
X        print "$file: message split didn't work: \$message_sep probably not right for this digest!\n";
X    }
X    @msgs = split(/$message_sep/, $body);   # split body into messages
X
X    $subjno = 0;
X
X    foreach $msg (@msgs) {
X        $subjno++;
X        $subj = "";
X        ($msg =~ /^Subject:\s+(.*)/m) && ($subj = $1);
X
X        # html escape seqs ("&" first, so the entities are not re-escaped)
X        $subj =~ s/&/&amp;/g;
X        $subj =~ s/</&lt;/g;
X        $subj =~ s/>/&gt;/g;
X        $subj =~ s/"/&quot;/g;
X
X        $subj =~ s/\s+$//;
X
X        # massage Subject for topics
X
X        1 while ($subj =~ s/^Re(2|\[\d+\])?[: ]\s*//i);  # trim all Re:'s
X
X        # Ignore empty subjects and the digest's own subject line.
X        next if $subj =~ /^\s*$/;
X        next if $subj =~ /\w+ Digest, Volume \d+,/i;
X        next if $subj =~ /\w+ Digest V\d+ #\d+/i;
X
X        #
X        # Make a key that's all lower case, and all alpha-numeric to
X        # reduce duplicate topics that differ only by those.  This
X        # also results in a list of topics sorted case-independent.
X        #
X        ($key = $subj) =~ tr/A-Z/a-z/;
X        $key =~ s/&(amp|lt|gt|quot);//g;
X        $key =~ s/\W+//g;
X
X        $subjrefs{$key} .= "$file.html#$subjno\001";
X        $realsubj{$key} = $subj unless defined($realsubj{$key});
X    }
X}
X
Xprint "Pass 2...\n";
X
X#
X# Pass 2 -- Read each file AGAIN (hey, that's what file system caches are
X#           for, right?) and write out an HTML version while writing the
X#           by-date and by-subject files as we go.
X#
X#           The files could be stored as they were read in, but that
X#           increases memory usage....  This is left as an exercise
X#           to the reader :-)
X#
X# Writes $dir/<digest>.html and appends to CONTENTS for every digest, and
X# fills in for the by-subject index:
X#   $subjlist{$key}   "\001"-terminated records "url\002from\002date"
X#   $contents_title, $first_date, $last_date, $msg_count
X#
X
Xforeach $file (@okayfiles) {
X    # warn() and next are separate statements on purpose: 'warn "...", next'
X    # executes the next before warn is called and loses the message.
X    if (!open(DIGEST, "<$file")) {
X        warn "Can't open $file: $!\n";
X        next;
X    }
X    $_ = <DIGEST>;                      # read the whole file
X    close(DIGEST);
X
X    $file =~ s!.*/!!;
X    if (!open(HTML, ">$dir/$file.html")) {
X        warn "Can't open $dir/$file.html: $!\n";
X        next;
X    }
X    print "$dir/$file.html\n";
X
X    $subjno = 0;
X
X    # html escape seqs for the whole digest ("&" first).  Every pattern
X    # below that looks for <, > or " therefore matches the entity form.
X    s/&/&amp;/g;
X    s/</&lt;/g;
X    s/>/&gt;/g;
X    s/"/&quot;/g;
X
X    # split off digest header; if the separator is missing, treat the
X    # whole file as body instead of reusing match variables left over
X    # from an earlier match
X    if (/$header_sep/o) {
X        $hdr  = "$`$&";
X        $body = $';
X    }
X    else {
X        $hdr  = "";
X        $body = $_;
X    }
X    $body =~ s/\n+----*\s*End of Volume \d+\s*-----*[\n\s]+\Z//i;
X    @hdrs = split(/\n\n+/, $hdr);           # split hdr by paragraphs
X    @msgs = split(/$message_sep/, $body);   # split body into messages
X
X    shift(@hdrs);                       # skip mail header
X    $first = shift(@hdrs);              # grab first line
X    $first = "" unless defined($first);
X
X    # Drop trailing paragraphs up to and including the "Topics" one.
X    # Stops when @hdrs runs out, so a header without "Topics" cannot
X    # spin forever.
X    while (@hdrs) {
X        last if pop(@hdrs) =~ /Topics/;
X    }
X
X    ($title, $volume, $date) =
X        $first =~ /(.*Digest),\s+((?:Volume|Issue).*?),\s+(.*)/;
X    $title  = "" unless defined($title);
X    $volume = "" unless defined($volume);
X    $date   = "" unless defined($date);
X    if ($title eq "" || $volume eq "" || $date eq "") {
X        print "$file: title wasn't parsed right: \$title = $title, \$volume = $volume, \$date = $date\n";
X    }
X
X    print HTML "<TITLE>$title, $volume</TITLE>\n";
X    print HTML "<H1>$title</H1>\n<H2>$date<BR>$volume</H2>\n";
X    print HTML "<PRE>",join("\n\n",@hdrs),"</PRE>\n<DL>\n<DT>Topics:<DD>\n";
X
X    # The first digest processed names the archive and starts the
X    # by-date index.
X    if (!$contents_title) {
X        $contents_title = "$title Archives";
X        print CONTENTS "<TITLE>$contents_title by Digest Date</TITLE>\n";
X        print CONTENTS "<H2>$contents_title by Digest Date</H2>\n";
X    }
X    if (!$first_date) {
X        $first_date = $date;
X    }
X    $last_date = $date;
X
X    print CONTENTS qq!<H3><A HREF="$file.html">$title, $volume</A><BR>!,
X        "$date</H3>\n";
X    print CONTENTS "<DL><DT>Contents:<DD>\n";
X
X    # $msg aliases the elements of @msgs, so the markup added to it below
X    # is what gets written when @msgs is joined after the loop.
X    foreach $msg (@msgs) {
X        #print ">>>$msg<<<\n"; #debug
X        $subjno++;
X        $msg_count++;
X        $subj = "";
X        $from = "";
X        ($msg =~ /^Subject:\s+(.*)/m) && ($subj = $1);
X        $subj =~ s/\s+$//;
X
X        # massage From: prefer the text before an <address>, then a
X        # (comment), then the whole field
X
X        if ($msg =~ /^From: (.*)/im) {
X            my $sender = $1;
X            if ($sender =~ /(.*)&lt;.*&gt;/) {
X                $from = $1;
X            }
X            elsif ($sender =~ /\((.*)\)/) {
X                $from = $1;
X            }
X            else {
X                $from = $sender;
X            }
X            $from =~ s/^\s+//;
X            $from =~ s/\s+$//;
X            $from =~ s/^&quot;(.*)&quot;$/$1/;
X
X            # Emphasize the name inside the From: line.  quotemeta makes
X            # every character of the name literal in the pattern.
X            if ($from ne "") {
X                $expr = quotemeta($from);
X                $msg =~ s/^From:\s+(.*)($expr)(.*)/From: $1<em>$2<\/em>$3/im;
X            }
X        }
X
X        # massage message body
X
X        $msg =~ s/^Subject:\s+(.*)/Subject: <STRONG>$1<\/STRONG>/im;
X#       $msg =~ s/\n/<BR>\n/g;
X        $msg =~ s/^(&gt;.*)/<EM>$1<\/EM>/gm;    # quoted lines
X        $msg = qq!<A NAME="$subjno"></A><PRE>\n$msg\n</PRE>\n!;
X
X        if ($from ne "" && $subj ne "") {
X            print HTML qq!<A HREF="#$subjno">$subj</A><BR>\n!;
X            print CONTENTS qq!<A HREF="$file.html#$subjno">$subj</A>!,
X                ", <EM>$from</EM><BR>\n";
X        }
X
X        # massage Subject for topics
X
X        1 while ($subj =~ s/^Re(2|\[\d+\])?[: ]\s*//i);  # trim all Re:'s
X        if ($subj !~ /^\s*$/ &&
X            $subj !~ /\w+ Digest, Volume \d+,/i &&
X            $subj !~ /\w+ Digest V\d+ #\d+/i)
X        {
X            # Make a key just like in pass 1.
X            ($key = $subj) =~ tr/A-Z/a-z/;
X            $key =~ s/&(amp|lt|gt|quot);//g;
X            $key =~ s/\W+//g;
X            $subjlist{$key} .= "$file.html#$subjno\002$from\002$date\001";
X
X            # Anchors recorded for this subject in pass 1, and this
X            # message's position among them.
X            @subjref = defined($subjrefs{$key})
X                ? split("\001", $subjrefs{$key})
X                : ();
X            $idx  = $subjcount{$key}++;
X            $seen = ($idx <= $#subjref) ? $subjref[$idx] : "";
X
X            if ($#subjref >= 0 && $seen ne "$file.html#$subjno") {
X                print "$key missed subj sequence!\n";
X                print "\$#subjref = $#subjref, \$idx = $idx, \$subjref[$idx] = $seen\n";
X                print "expected $file.html#$subjno\n";
X            }
X
X            if ($#subjref <= 0) {
X                # only one message with this subject: no links to add
X            }
X            elsif ($idx > $#subjref) {
X                print "error! \$idx ($idx) > \$#subjref ($#subjref), $key\n";
X            }
X            elsif ($idx == 0) {
X                $msg .= "\n<P><CENTER>[no prev subject] " .
X                    "<A HREF=\"$subjref[1]\">[next subject]</A>" .
X                    "</CENTER>\n";
X            }
X            elsif ($idx == $#subjref) {
X                $msg .= "\n<P><CENTER>" .
X                    "<A HREF=\"$subjref[$idx-1]\">[prev subject]</A> " .
X                    "[no next subject]</CENTER>\n";
X            }
X            else {
X                $msg .= "\n<P><CENTER>" .
X                    "<A HREF=\"$subjref[$idx-1]\">[prev subject]</A> " .
X                    "<A HREF=\"$subjref[$idx+1]\">[next subject]</A>" .
X                    "</CENTER>\n";
X            }
X        }
X    }
X
X    print CONTENTS "</DL>\n";
X
X    print HTML "</DL><HR>\n",join("<HR>\n",@msgs),"\n";
X
X    # Buffered write errors only show up when the handle is closed.
X    close(HTML) || warn "Can't close $dir/$file.html: $!\n";
X}
X
X# Finish the by-date index; buffered write errors only surface at close.
Xclose(CONTENTS) || warn "Can't close $dir/contents.html: $!\n";
X
X#
X# Write the by-subject index from the data gathered in pass 2:
X#   $subjlist{$key}   "\001"-separated records "url\002from\002date"
X#   $realsubj{$key}   display form of the subject (set in pass 1)
X#
X
X# These stay undefined when no digest was processed; give them printable
X# values so the summary lines do not interpolate undef.
X$contents_title = "" unless defined($contents_title);
X$first_date     = "" unless defined($first_date);
X$last_date      = "" unless defined($last_date);
X$msg_count      = 0  unless defined($msg_count);
X
Xprint TOPICS "<TITLE>$contents_title by Subject</TITLE>\n";
Xprint TOPICS "<H2>$contents_title by Subject</H2>\n";
Xprint TOPICS "<STRONG>Starting:</STRONG> $first_date<BR>\n";
Xprint TOPICS "<STRONG>Ending:</STRONG> $last_date<BR>\n";
Xprint TOPICS "<STRONG>Messages:</STRONG> $msg_count<BR>\n";
Xprint TOPICS "<UL>\n";
X
Xforeach $key (sort keys %subjlist) {
X    #
X    # for each subject, record each file it was found in
X    #
X    @msgs = split("\001", $subjlist{$key});
X
X    # A key with no recorded subject means the two passes disagreed;
X    # test with defined() so the check itself does not warn.
X    $topic = $realsubj{$key};
X    if (!defined($topic) || $topic eq "") {
X        print "huh? lost subject for key \"$key\"!\n";
X        $topic = "";
X    }
X
X    print TOPICS "<LI><STRONG>$topic</STRONG>\n<UL>\n";
X    foreach $msg (@msgs) {
X        ($url, $from, $date) = split("\002", $msg, 3);
X        print TOPICS qq!<LI><A HREF="$url">$from</A>, $date\n!;
X    }
X    print TOPICS "</UL>\n\n";
X}
X
Xprint TOPICS "</UL>\n";
X
Xclose(TOPICS) || warn "Can't close $dir/topics.html: $!\n";
END_OF_FILE
# Compare the extracted file's byte count with the 7960 recorded in this
# archive.  A mismatch only prints a message; extraction still continues.
if test 7960 -ne `wc -c <'digest_to_html.perl'`; then
echo shar: \"'digest_to_html.perl'\" unpacked with wrong size!
fi
# Make the extracted script executable.
chmod +x 'digest_to_html.perl'
# end of 'digest_to_html.perl'
fi
echo shar: End of shell archive.
# Exit here so the shell never tries to run the text that follows the archive.
exit 0
--
Paul Close pdc@sgi.com http://reality.sgi.com/employees/pdc/
No fate but what we make
|
|