#!/usr/bin/perl # Copyright (c) 2008 Rick Wash # # Permission to use, copy, modify, and/or distribute this software for any # purpose with or without fee is hereby granted, provided that the above # copyright notice and this permission notice appear in all copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # # # parse_all_url.pl # Rick Wash # # Download a ton of pages from del.icio.us/URL (or from a person's delicious page) # and save them as .html files. # Then, you can run this file: # - parse_all_url create # (creates the database tables _site and _tag) # - parse_all_url # (parses all of the .html files in the current and all sub-directories) use File::Find; # Config info $db_name = "delicious2007"; $db_user = "root"; $db_passwd = "********"; # Connect to the Database use DBI; $dbh = DBI->connect("dbi:mysql:$db_name", $db_user, $db_passwd); sub create_tables { ($sample) = @_; $site_sql = " CREATE TABLE `${sample}_site` ( `id` int(11) NOT NULL auto_increment, `deliciousID` varchar(200) NOT NULL default '', `title` varchar(400) default NULL, `url` varchar(500) default NULL, `user` varchar(200) default NULL, `date` date default NULL, `position` int(11) default NULL, PRIMARY KEY (`id`), KEY `deliciousID` (`deliciousID`), KEY `date` (`date`), KEY `position` (`position`), KEY `user` (`user`), KEY `id_date` (`deliciousID`,`date`) ) ENGINE=MyISAM DEFAULT CHARSET=latin1"; $tag_sql = " CREATE TABLE `${sample}_tag` ( `id` int(11) NOT NULL auto_increment, `site_id` int(11) default NULL, `tag` varchar(200) default NULL, `position` int(11) default NULL, PRIMARY KEY (`id`), KEY `tag` (`tag`), KEY `deliciousID` (`site_id`) ) ENGINE=MyISAM DEFAULT CHARSET=latin1"; $dbh->do($site_sql); $dbh->do($tag_sql); } sub fix_month { $_ = shift; s/Jan/01/g; s/Feb/02/g; s/Mar/03/g; s/Apr/04/g; s/May/05/g; s/Jun/06/g; s/Jul/07/g; s/Aug/08/g; s/Sep/09/g; s/Oct/10/g; s/Nov/11/g; s/Dec/12/g; return $_; } $first = shift; # first parameter from command line if ($first eq "create") { $sample = shift; create_tables($sample); exit(0); } else { $sample = $first; } $site_sql = "INSERT INTO ${sample}_site (deliciousID, title, url, user, date, position) VALUES (?, ?, ?, ?, ?, ?)"; $site_sth = $dbh->prepare($site_sql); $meta_sql = "INSERT INTO ${sample}_tag (site_id, tag, position) VALUES (?, ?, ?)"; $meta_sth = $dbh->prepare($meta_sql); # First pass through the file. Find the Title, URL, and total number of posts sub pass_one() { # Get the Site ID number from the title if (/del.icio.us\/url\/([^<]*)<\/title>/) { $site_id = $1; } if (/<h4 class="nomb"><a href="([^\"]*)" rel="nofollow">([^<]*)<\/a><\/h4>/){ $title = $2; $url = $1; } if (/this url has been saved by (\d*) people/) { $total_count = $1 } } # Pass two through the file. Find the individual posts and store them in the database sub pass_two() { if (/<h5 class="datehead">(...) ‘(\d\d)<\/h5>/) { $mo = fix_month($1); if ($2 < 90) { $date = "20$2-${mo}-01 00:00:00"; } else { $date = "19$2-${mo}-01 00:00:00"; } } if (/<li><p>by/) { # Found entry @tags = (); $order = 1; if (/who" href="[^\\"]*">([^<]*)/) { $user = $1; } while (/(to) <a href="[^"]*">([^<]*)<\/a>(.*)/) { push @tags, [ $2, $order ]; $_ = $1 . $3; # Remove the matched section $order += 1; } $site_sth->execute($site_id, $title,$url,$user,$date, $number--) || warn "SQL Error Inserting into site"; $id = $dbh->last_insert_id(undef, undef, undef, undef); foreach my $i (@tags) { ($tag, $order) = @{$i}; $meta_sth->execute($id, $tag, $order) || warn "SQL Error Inserting into metadata"; } } } sub parse_file { my ($fname) = @_; $number = 1; # First pass.... open(INFILE, "<", $fname) || warn("Cannot open $fname: $!"); # Loop through the file, reading one line at a time while(<INFILE>) { pass_one() } close INFILE; $number = $total_count; # Second Pass open(INFILE, "<", $fname) || warn("Cannot open $fname: $!"); # Loop through the file, reading one line at a time while(<INFILE>) { pass_two() } close INFILE; } sub check_file { if (!(/^.*\.html$/)) { return; } parse_file($_); } find(\&check_file, ".");