#!/usr/bin/perl -w
# Copyright (C) 2007 Reini Urban
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301 USA.

=pod

=head1 NAME

wiki2omega [-u URL] WIKIDB

=over 4

=item B<WIKIDB>

The first argument may be a phpwiki URL, a directory or a single file.
If it is a URL the -url option is optional, otherwise it is mandatory.

=item B<-url>

The URL prefix for the xapian database, so that results can link to
the result page.

=item B<-man>

Prints the manual page and exits.

=back

=head1 DESCRIPTION

Convert a phpwiki database to a format recognizable by xapian scriptindex,
to prepare a fulltext index.

=cut
=head1 SYNOPSIS

  # serial dump (see config.ini for DEFAULT_DUMP_DIR)
  DEFAULT_DUMP_DIR = /var/www/wikidb/pgsrc
  lwp-request -d http://localhost/phpwiki/?action=dumpserial
  wiki2omega -u wiki $DEFAULT_DUMP_DIR | \
    scriptindex /var/lib/omega/data/wiki /usr/share/omega/wiki2omega.script

  # zip dump
  wiki2omega http://localhost/phpwiki/ | \
    scriptindex /var/lib/omega/data/wiki /usr/share/omega/wiki2omega.script

  # single file
  wiki2omega -u wiki /tmp/wikidump/HomePage | \
    scriptindex /var/lib/omega/data/wiki /usr/share/omega/wiki2omega.script

  # real life usage
  DEFAULT_DUMP_DIR = /var/www/wikidb/pgsrc
  nice /usr/bin/lwp-request -P -d -m GET "http://localhost/wiki/?action=dumpserial"
  nice wiki2omega -u /wiki $DEFAULT_DUMP_DIR | \
    scriptindex /var/lib/omega/data/wiki /var/lib/omega/scripts/wiki2index.script \
    > /var/log/omega/updateindex-wiki.log

=cut

use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Digest::MD5 qw(md5_hex);

# Archive::Zip is only loaded on demand (zip input), so its AZ_OK status
# code is mirrored here instead of being imported.
use constant AZ_OK => 0;

my $man  = 0;
my $help = 0;
my $db;
my $wikiurl;

GetOptions(
    'help|?'  => \$help,
    'man'     => \$man,
    'url|u=s' => \$wikiurl,
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

# urldecode($string)
# Returns a copy of $string with every %XX escape (hex digits in either
# case) replaced by the corresponding character.
sub urldecode ($) {
    my $s = shift;
    $s =~ s/%([A-Fa-f0-9]{2})/chr(hex($1))/eg;
    return $s;
}

# slurp_file($path)
# Returns the whole content of $path as one string, or undef when the
# file cannot be opened or read; $! is left set for the caller to report.
sub slurp_file {
    my ($path) = @_;
    open my $fh, '<', $path or return;
    local $/;    # slurp mode: read everything in one go
    my $data = <$fh>;
    close $fh;
    return $data;
}

# p($content)
# Prints one scriptindex record for a single dumped wiki page to STDOUT.
#
# $content is the raw page dump: header lines, an empty line, then the
# page text.  The fields md5, size and language are always emitted; date,
# title, author and lastmod are taken from the dump headers when present.
# The page text becomes the multi-line "content" field (each line is
# prefixed with "=", which is how scriptindex continues a value), and the
# text of wiki heading lines ("!...") is collected into a trailing
# "headers" field.  Nothing is printed for empty or false $content, and
# nothing is printed when the dump has no empty line ending its headers.
# The caller prints the final newline that terminates the record.
sub p ($) {
    my $content = shift;
    return unless $content;

    my %hdr = (
        md5      => md5_hex($content),
        size     => length($content),
        language => 'english',
    );
    my $in_header = 1;
    my $headings  = '';

    # Iterate with for: a truthiness-tested "shift" loop would stop at the
    # first empty line (or a line reading "0") and truncate the page.
    for my $line (split /\n/, $content) {
        $line =~ s/\r$//;

        if (!$in_header) {
            # Page text: remember heading lines, emit every line.
            if ($line =~ /^\s*\!+(.+)$/) {
                $headings .= "$1 ";
            }
            print "=$line\n";
            next;
        }

        if ($line eq '') {
            # Headers finished, dump them.
            my $title = defined $hdr{title} ? $hdr{title} : '';
            print "url=$wikiurl/$title\n";
            for my $name (sort keys %hdr) {
                print "$name=$hdr{$name}\n";
            }
            print "content";
            $in_header = 0;
            next;
        }

        # Each header line is examined on its own, so the indented
        # continuation lines of a folded header are matched directly.
        # Only the captured value is used: the indentation in front of
        # "pagename=" etc. must not leak into the field.
        if ($line =~ /^Date:\s*(.*?)\s*$/i) {
            $hdr{date} = $1 if length $1;
        }
        elsif ($line =~ / pagename=(.*?);$/) {
            $hdr{title} = urldecode($1) if length $1;
        }
        elsif ($line =~ / author=(.*?);$/) {
            $hdr{author} = urldecode($1) if length $1;
        }
        elsif ($line =~ / lastmodified=(.*?);$/) {
            if (length $1) {
                $hdr{lastmod} = $1;
                $hdr{date}    = $1;    # overrides a Date: header seen earlier
            }
        }
    }

    # Terminate the field here, so that the newline printed by the caller
    # produces the blank line that separates scriptindex records.
    print "headers=$headings\n" if $headings;
    return;
}

# Without -u the single argument serves both as URL prefix and as source.
if ($wikiurl) {
    $db = shift or die "Syntax: $0 -u WIKIURL DATABASE\n";
}
else {
    $wikiurl = $db = shift or die "Syntax: $0 WIKIURL\n";
}

if ($db =~ /^http/) {
    # Fetch the zip dump.  List-form system() keeps the user supplied URL
    # away from the shell.
    # NOTE(review): fixed, predictable download path in /tmp.
    my $zipfile = '/tmp/wikidb.zip';
    system('wget', '-nv', "-O$zipfile", "$db?action=zip") == 0
        or die "wget $db?action=zip failed (status $?)\n";
    $db = $zipfile;
}

if ($db =~ /\.zip$/i) {
    require Archive::Zip;    # dies with a clear message when not installed
    my $zip = Archive::Zip->new();
    die 'wikidb.zip read error' unless $zip->read($db) == AZ_OK;
    for my $member ($zip->members()) {
        next if $member->isDirectory();
        # Scalar assignment on purpose: contents() also returns a status
        # in list context.
        my $page = $zip->contents($member);
        p($page);
        print "\n";
    }
}
elsif (-d $db) {
    for my $filename (glob "$db/*") {
        my $content = slurp_file($filename);
        warn "Cannot read $filename: $!\n" unless defined $content;
        p($content);
        print "\n";
    }
}
elsif (-f $db) {
    my $content = slurp_file($db);
    die "Cannot read $db: $!\n" unless defined $content;
    p($content);
    print "\n";
}
else {
    die "invalid argument";
}