#!/usr/local/bin/perl
#############################################################
# Plik: inlinksAnalysis.pl #
# Opis: Ten skrypt analizuje linki zwrotne pozyskane Yahoo! #
# z pliku TSV usługi Yahoo! #
#############################################################
use LWP::Simple;
use LWP::UserAgent;
use HTML::TokeParser;
use strict;
use warnings;

# Inbound-link analysis.
#
# Usage: inlinksAnalysis.pl <tsv-file> <base-url>
#
#   <tsv-file>  Tab-separated file; the second column of every line is taken
#               as the URL of a page to download.
#   <base-url>  Text that a link's href must contain (case-insensitive,
#               matched literally) for the link to be counted.
#
# Every listed page is downloaded to temp.txt and scanned for <a> tags. For
# each href containing <base-url> the script counts how many times it was
# seen and remembers the anchor text of its most recent occurrence. The
# result is written to report.txt, most frequently linked URL first, one
# line per URL in the form:  url, count, "anchor text"

my @URLs = ();

# Read the input parameters: file name and base URL.
# Missing arguments are normalised to empty strings so that the messages
# below and the pattern construction never operate on undef.
my $fileToProcess = defined $ARGV[0] ? $ARGV[0] : '';
my $baseurl       = defined $ARGV[1] ? $ARGV[1] : '';
print "\nPrzetwarzanie: $fileToProcess";

# Collect the page URLs from the second column of the TSV file.
if (-e $fileToProcess) {
    open my $in, '<', $fileToProcess or die $!;
    while (my $line = <$in>) {
        # Strip the line ending; otherwise a URL sitting in the last column
        # would carry a trailing newline into the HTTP request.
        $line =~ s/\r?\n\z//;
        my @fragments = split(/\t/, $line);
        my $url = $fragments[1];

        # Ignore rows that have no (or an empty) URL column.
        next unless defined $url && length $url;
        push @URLs, $url;
    }
    close $in;
}
else {
    print "\nPlik ($fileToProcess) nie istnieje";
}

my $ua = LWP::UserAgent->new;
$ua->agent("My Crawler");

# \Q...\E makes the base URL match literally: characters such as '.', '?'
# or '+' in a URL must not be interpreted as regex metacharacters.
my $baseurlPattern = qr/\Q$baseurl\E/i;

my %linkPopHash   = ();    # href => number of occurrences
my %anchorPopHash = ();    # href => anchor text of the last occurrence

# Iterate over exactly the URLs that were read (no extra trailing pass).
for my $pageUrl (@URLs) {
    my $res = $ua->get($pageUrl, ':content_file' => "temp.txt");

    # Without this check a failed download would leave the previous page's
    # temp.txt in place and its links would be counted a second time.
    next unless $res->is_success;

    my $p = HTML::TokeParser->new("temp.txt") or next;
    while (my $token = $p->get_tag("a")) {
        # Fetch the link target and its anchor text.
        my $url        = $token->[1]{href} || "-";
        my $anchorText = $p->get_trimmed_text("/a");
        $url =~ s/^\s+//;
        $url =~ s/\s+$//;

        next unless $url =~ $baseurlPattern;

        $linkPopHash{$url}++;
        $anchorPopHash{$url} = $anchorText;
    }
}

# Write the report, sorted by descending link count.
open my $report, '>', 'report.txt'
    or die "Nie można otworzyć report.txt: $!";
foreach my $key ( sort { $linkPopHash{$b} <=> $linkPopHash{$a} }
                  keys %linkPopHash ) {
    print {$report} "$key, $linkPopHash{$key}, \"$anchorPopHash{$key}\"\n";
}
# Buffered write errors only surface when the handle is closed.
close $report or die "Nie można zapisać report.txt: $!";