#!/usr/bin/perl
use warnings;
use strict;

# Log2Sitemap - convert Apache  log files in the "combined" log format 
# to google sitemaps
# Copyright (C) 2006 by Moritz Lenz
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
#
####### EDIT ##########
#
# Set this value to your domain name, including http:// at the beginning
# but no trailing slash. It is prepended to every URL taken from the log
# when the <loc> entries are written:
my $host = "http://moritz.faui2k3.org";
#
# 
# Set this to your desired output filename. The file is opened for writing
# (and therefore overwritten) on every run:
my $out_file = "sitemap.xml";
#
#
######## END EDIT ######
#
# Don't edit anything beyond this point, unless you really know what you are
# doing

# Forward declaration so that the prototype of url_encode() is already known
# where the sub is called, although its body is at the end of the file.
sub url_encode($);

# Popularity score per URL: every successful request adds 1 to the score of
# the requested URL, requests for static assets add only 0.2.
my %urls;

# Read the log line by line from the files named on the command line, or from
# STDIN when no file is given.
while (my $line = <>) {
	chomp $line;

	# Apache "combined" format:
	#   host ident user [date] "request" status size "referer" "user-agent"
	# Only the request line ($5) and the status code ($6) are needed; lines
	# that do not have this shape are skipped.
	next unless $line =~ m/^([^ ]*) ([^ ]*) ([^ ]*) \[([^\]]+)\] "((?:[^"]|\\")*?)" (\d+) (-|\d+) "((?:[^"]|\\")*)" "((?:[^"]|\\")*)"/;
	my ($request, $status) = ($5, $6);

	# Only successfully served URLs belong in the sitemap.
	next unless $status == 200;

	# The request line looks like "METHOD url PROTOCOL"; take the url part.
	next unless $request =~ m/^[A-Z]+ (.*?) /;
	my $url = $1;

	# Rate static assets (style sheets, scripts, images) lower than pages.
	# The extension must follow a literal dot and end the path part of the
	# URL, so "/mexico" or "/nodejs" are not mistaken for assets while
	# "/style.css?v=2" and "/PHOTO.JPG" are recognised.
	if ($url =~ m#^[^?]*\.(?:css|jpg|jpeg|png|gif|js|ico)(?:\?|$)#i) {
		$urls{$url} += 0.2;
	} else {
		$urls{$url}++;
	}
}

# sort list of files

# Most requested URLs first. Ties are broken by the URL itself so that the
# order of the output does not depend on Perl's hash ordering.
my @f = sort { $urls{$b} <=> $urls{$a} or $a cmp $b } keys %urls;

# Highest score, used to scale every priority into the range (0, 1].
# Falls back to 1 when no URL was collected, so $max is never undefined.
my $max = @f ? $urls{$f[0]} : 1;

# write xml sitemap
open(my $out_fh, ">", $out_file) or die "Can't open '$out_file' for writing: $!";
print {$out_fh} <<XML;
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.google.com/schemas/sitemap/0.84">

XML
foreach my $url (@f) {
	print  {$out_fh} "\t<url>\n";
	print  {$out_fh} "\t\t<loc>", $host, url_encode($url), "</loc>\n";
	# priority is the score relative to the most requested URL
	printf {$out_fh} "\t\t<priority>%.5f</priority>\n", $urls{$url} / $max;
	print  {$out_fh} "\t</url>\n";
}
print {$out_fh} "\n</urlset>\n";

# Output is buffered, so write errors (e.g. a full disk) may only be reported
# when the handle is closed; check it instead of leaving a truncated sitemap
# unnoticed.
close($out_fh) or die "Can't write '$out_file': $!";


# Escape a URL taken from the access log so that it can be written into a
# <loc> element of the sitemap.
#
# Argument: the URL as a string of bytes (as read from the log file).
# Returns:  the escaped copy; the argument itself is not modified.
#
# Two kinds of escaping are applied:
#   * XML entity escaping of & ' < >
#   * percent-encoding of every byte that may not appear literally in a URL
# Characters that have a meaning inside a URL (such as ? = + ~) and existing
# percent escapes (%20) are left alone; otherwise query strings would break
# and already encoded URLs would be encoded twice.
#
# The prototype is kept because it has to match the forward declaration at
# the top of the file.
sub url_encode($) {
	my $url = shift;

	# XML entities. '&' has to come first so that the ampersands introduced
	# by the other replacements are not escaped again.
	$url =~ s#&#&amp;#g;
	$url =~ s#'#&apos;#g;
	$url =~ s#<#&lt;#g;
	$url =~ s#>#&gt;#g;

	# Percent-encode everything else that is not allowed literally. The
	# escape is always two hex digits (a tab is %09, not %9).
	$url =~ s{
		(
			[^a-zA-Z0-9\-._~/?:\@!\$&()*+,;=%]  # byte not allowed in a URL
			|
			%(?![0-9A-Fa-f]{2})                 # '%' that starts no escape
		)
	}{sprintf "%%%02x", ord($1)}egx;

	return $url;
}
