#!/usr/bin/perl
#
# Import a given HTML file into Joomla
#
# Copyright (C) 2007 Paco Hope
# Distributed under the same license as Perl itself.
# (See the bottom of this file)
#
# Original from http://paco.to/?p=191
#
use strict;
use HTML::Parser;
use POSIX qw(strftime);
use DBI;
use DBD::mysql;
# MySQL connection settings -- edit these to match your installation.
# The user and password are handed to DBI->connect; the database and host
# names are interpolated into the DSN string.
$db::user = "dbuser";
$db::passwd = "dbpass";
$db::database = "joomla";
$db::hostname = "localhost";
# MySQL server port (3306 is the MySQL default).
$db::port = "3306";
# Table that receives the articles; interpolated into the INSERT statement.
$db::tablename = "jos_content";
# Publication state stored with every imported article (1 = published).
$j::state = 1;
# Numeric Joomla section and category IDs the articles are inserted into.
$j::section = 1;
$j::category = 1;
# Numeric creator ID stored with every imported article (62 = admin).
$j::creator = 62;
###########
### No need to change anything below here
###########
# Start-tag callback for HTML::Parser, adapted from the example in the
# HTML::Parser perldoc. It is registered with the argspec "tagname,self".
#
# Every tag other than <title> is ignored. When <title> opens, two more
# handlers are installed on the same parser:
#   - a text handler that stores the decoded text in $j::title
#   - an end handler that calls eof on the parser once </title> is seen
sub title_handler {
    my ( $tagname, $parser ) = @_;
    return unless $tagname eq "title";

    # Store the decoded ("dtext") title text in the global title slot.
    my $remember_title = sub {
        my ($text) = @_;
        $j::title = $text;
    };

    # Signal end of input when the title element closes.
    my $stop_after_title = sub {
        my ( $closing_tag, $active_parser ) = @_;
        $active_parser->eof if $closing_tag eq "title";
    };

    $parser->handler( text => $remember_title, "dtext" );
    $parser->handler( end => $stop_after_title, "tagname,self" );
}
# Import one HTML file as a Joomla article.
#
# Given a file name:
#   - parse it for its <title> (title_handler stores it in $j::title)
#   - get its modification time from the filesystem
#   - read its entire contents
#   - insert it into the Joomla database through the prepared $db::sth
#
# The article's title and title alias are both set to the HTML title, and
# its creation and publish-up dates are both set to the file's mtime.
# A file that cannot be parsed, opened, stat'ed or read is skipped with a
# warning instead of being inserted with missing data.
sub insertFile {
    my $file = shift;

    # Work on a localized copy of the title so that every file starts from
    # the default. Otherwise a file without a <title> would silently reuse
    # the title captured from the previously imported file.
    local $j::title = $j::title;

    my $p = HTML::Parser->new( api_version => 3 );

    # Deliver the title text in a single piece; by default the parser may
    # hand text over in several chunks and only the last would be kept.
    $p->unbroken_text(1);
    $p->handler( start => \&title_handler, "tagname,self" );

    # parse_file returns undef (and sets $!) when the file can't be opened.
    if ( !defined $p->parse_file($file) ) {
        warn "can't parse $file: $!";
        return;
    }

    # Open the file and stick its entire contents into $htmlBody
    my $fh;
    if ( !open( $fh, '<', $file ) ) {
        warn "can't open $file: $!";
        return;
    }

    # Size and mod time come from the open handle, so they describe the
    # file actually being read. Fields 7 and 9 of stat are size and mtime.
    my ( $size, $mtime ) = ( stat($fh) )[ 7, 9 ];
    if ( !defined $mtime ) {
        warn "can't stat $file: $!";
        close $fh;
        return;
    }

    # make a MySQL compatible date (YYYY-MM-DD HH:MM:SS, local time)
    my $mysqlDate = strftime( "%Y-%m-%d %H:%M:%S", localtime($mtime) );

    my $htmlBody;
    my $numread = read $fh, $htmlBody, $size;
    if ( !defined $numread ) {
        warn "read error on $file: $!";
        close $fh;
        return;
    }
    if ( $numread != $size ) {
        warn "short read on $file ($numread instead of $size )";
    }
    close $fh;

    print "Title: \"$j::title\"\t";
    print "Date: \"$mysqlDate\"\n";

    # Bind order must match the placeholders of the prepared INSERT.
    $db::sth->execute(
        $j::title,    $j::title,  $htmlBody,   $j::state, $j::section,
        $j::category, $mysqlDate, $j::creator, $mysqlDate
    );
}
# Import every plain file found directly inside $dir (no recursion).
#
# Each entry name is printed and the file is handed to insertFile.
# Entries that are not plain files (sub-directories, "." and "..") are
# skipped. Dies if the directory cannot be opened.
sub processDir {
    my $dir = shift;

    # A lexical handle avoids clobbering a global DIR bareword.
    opendir( my $dh, $dir ) or die "can't opendir $dir: $!";

    # The explicit defined() keeps the loop going for an entry named "0".
    while ( defined( my $entry = readdir($dh) ) ) {
        my $path = "$dir/$entry";
        next unless -f $path;
        print "$entry ";
        insertFile($path);
    }
    closedir $dh;
}
# Default title for our articles, if one isn't defined in the HTML
$j::title = "Article";

# Exactly one command-line argument is expected: the directory that holds
# the HTML files to import. It must be a readable directory.
if ( $#ARGV != 0 ) {
    die "need a directory name ($#ARGV)";
}
$j::dir = $ARGV[0];
if ( !-r $j::dir ) {
    die "can't open $j::dir";
}
if ( !-d $j::dir ) {
    die "$j::dir is not a directory";
}

# Build the DSN from the configured settings, including the configured
# port, and stop right away if the connection cannot be established:
# carrying on with an undefined handle would only fail later with an
# unhelpful "can't call method on undefined value" error.
$db::dsn =
  "DBI:mysql:database=$db::database;host=$db::hostname;port=$db::port";
$db::dbh = DBI->connect( $db::dsn, $db::user, $db::passwd )
  or die "can't connect to $db::dsn: $DBI::errstr";
# The jos_content schema (as of 1.0.15)
# `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
# `title` varchar(100) NOT NULL DEFAULT '',
# `title_alias` varchar(100) NOT NULL DEFAULT '',
# `introtext` mediumtext NOT NULL,
# `fulltext` mediumtext NOT NULL,
# `state` tinyint(3) NOT NULL DEFAULT '0',
# `sectionid` int(11) unsigned NOT NULL DEFAULT '0',
# `mask` int(11) unsigned NOT NULL DEFAULT '0',
# `catid` int(11) unsigned NOT NULL DEFAULT '0',
# `created` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
# `created_by` int(11) unsigned NOT NULL DEFAULT '0',
# `created_by_alias` varchar(100) NOT NULL DEFAULT '',
# `modified` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
# `modified_by` int(11) unsigned NOT NULL DEFAULT '0',
# `checked_out` int(11) unsigned NOT NULL DEFAULT '0',
# `checked_out_time` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
# `publish_up` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
# `publish_down` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
# `images` text NOT NULL,
# `urls` text NOT NULL,
# `attribs` text NOT NULL,
# `version` int(11) unsigned NOT NULL DEFAULT '1',
# `parentid` int(11) unsigned NOT NULL DEFAULT '0',
# `ordering` int(11) NOT NULL DEFAULT '0',
# `metakey` text NOT NULL,
# `metadesc` text NOT NULL,
# `access` int(11) unsigned NOT NULL DEFAULT '0',
# `hits` int(11) unsigned NOT NULL DEFAULT '0'
# Now build up the query. No column list is given, so the values must
# appear in the table's column order.
my $q = "INSERT INTO `$db::tablename` VALUES ";

# The first value is the auto-increment id, which is left to MySQL.
# The nine placeholders are bound, in order, to: title, title_alias,
# introtext, state, sectionid, catid, created, created_by and publish_up.
# Dates are bound in the form "2007-07-04 21:07:51".
$q .= "(null, ?, ?, ?, '', ?, ?, 0, ?, ?, ?, '', '0000-00-00 00:00:00', ";
$q .= "0, 0, '0000-00-00 00:00:00', ?, '0000-00-00 00:00:00', '', '', ";

# The attribs value: the doubled backslashes put a literal \n escape
# sequence into the SQL string literal.
$q .= "'pageclass_sfx=\\nback_button=\\nitem_title=1\\nlink_titles=\\nintrotext=1\\n";
$q .= "section=0\\nsection_link=0\\ncategory=0\\ncategory_link=0\\nrating=\\nauthor=\\n";
$q .= "createdate=\\nmodifydate=\\npdf=\\nprint=\\nemail=\\nkeyref=\\ndocbook_type=', ";
$q .= "1, 0, 1, '', '', 0, 0)";

# Prepare the query once. We'll execute it many times.
# Stop here if that fails; every later execute would otherwise blow up
# on an undefined statement handle.
$db::sth = $db::dbh->prepare($q)
  or die "can't prepare insert statement: " . $db::dbh->errstr;

print "processing '$j::dir'\n";
processDir($j::dir);
$db::dbh->disconnect;
# License Terms for this file
# This is the Perl Artistic License from
# http://www.perl.com/pub/a/language/misc/Artistic.html
#
# Preamble
#
# The intent of this document is to state the conditions under which a Package
# may be copied, such that the Copyright Holder maintains some semblance of
# artistic control over the development of the package, while giving the users
# of the package the right to use and distribute the Package in a more-or-less
# customary fashion, plus the right to make reasonable modifications.
#
# Definitions
#
# "Package" refers to the collection of files distributed by the Copyright
# Holder, and derivatives of that collection of files created through textual
# modification.
#
# "Standard Version" refers to such a Package if it has not been modified, or
# has been modified in accordance with the wishes of the Copyright Holder as
# specified below.
#
# "Copyright Holder" is whoever is named in the copyright or copyrights for the
# package.
#
# "You" is you, if you're thinking about copying or distributing this Package.
#
# "Reasonable copying fee" is whatever you can justify on the basis of media
# cost, duplication charges, time of people involved, and so on. (You will not
# be required to justify it to the Copyright Holder, but only to the computing
# community at large as a market that must bear the fee.)
#
# "Freely Available" means that no fee is charged for the item itself, though
# there may be fees involved in handling the item. It also means that recipients
# of the item may redistribute it under the same conditions they received it.
#
# 1. You may make and give away verbatim copies of the source form of the
# Standard Version of this Package without restriction, provided that you
# duplicate all of the original copyright notices and associated disclaimers.
#
# 2. You may apply bug fixes, portability fixes and other modifications derived
# from the Public Domain or from the Copyright Holder. A Package modified in
# such a way shall still be considered the Standard Version.
#
# 3. You may otherwise modify your copy of this Package in any way, provided
# that you insert a prominent notice in each changed file stating how and when
# you changed that file, and provided that you do at least ONE of the following:
#
# 1. place your modifications in the Public Domain or otherwise make them
# Freely Available, such as by posting said modifications to Usenet or an
# equivalent medium, or placing the modifications on a major archive site such
# as uunet.uu.net, or by allowing the Copyright Holder to include your
# modifications in the Standard Version of the Package.
#
# 2. use the modified Package only within your corporation or organization.
#
# 3. rename any non-standard executables so the names do not conflict with
# standard executables, which must also be provided, and provide a separate
# manual page for each non-standard executable that clearly documents how it
# differs from the Standard Version.
#
# 4. make other distribution arrangements with the Copyright Holder.
#
# 4. You may distribute the programs of this Package in object code or
# executable form, provided that you do at least ONE of the following:
#
# 1. distribute a Standard Version of the executables and library files,
# together with instructions (in the manual page or equivalent) on where to get
# the Standard Version.
#
# 2. accompany the distribution with the machine-readable source of the Package
# with your modifications.
#
# 3. give non-standard executables non-standard names, and clearly document the
# differences in manual pages (or equivalent), together with instructions on
# where to get the Standard Version.
#
# 4. make other distribution arrangements with the Copyright Holder.
#
# 5. You may charge a reasonable copying fee for any distribution of this
# Package. You may charge any fee you choose for support of this Package. You
# may not charge a fee for this Package itself. However, you may distribute this
# Package in aggregate with other (possibly commercial) programs as part of a
# larger (possibly commercial) software distribution provided that you do not
# advertise this Package as a product of your own. You may embed this Package's
# interpreter within an executable of yours (by linking); this shall be
# construed as a mere form of aggregation, provided that the complete Standard
# Version of the interpreter is so embedded.
#
# 6. The scripts and library files supplied as input to or produced as output
# from the programs of this Package do not automatically fall under the
# copyright of this Package, but belong to whomever generated them, and may be
# sold commercially, and may be aggregated with this Package. If such scripts or
# library files are aggregated with this Package via the so-called "undump" or
# "unexec" methods of producing a binary executable image, then distribution of
# such an image shall neither be construed as a distribution of this Package nor
# shall it fall under the restrictions of Paragraphs 3 and 4, provided that you
# do not represent such an executable image as a Standard Version of this
# Package.
#
# 7. C subroutines (or comparably compiled subroutines in other languages)
# supplied by you and linked into this Package in order to emulate subroutines
# and variables of the language defined by this Package shall not be considered
# part of this Package, but are the equivalent of input as in Paragraph 6,
# provided these subroutines do not change the language in any way that would
# cause it to fail the regression tests for the language.
#
# 8. Aggregation of this Package with a commercial distribution is always
# permitted provided that the use of this Package is embedded; that is, when no
# overt attempt is made to make this Package's interfaces visible to the end
# user of the commercial distribution. Such use shall not be construed as a
# distribution of this Package.
#
# 9. The name of the Copyright Holder may not be used to endorse or promote
# products derived from this software without specific prior written permission.
#
# 10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
#
# The End