/* Set the home directory (the directory where the software is installed).
   %default only supplies a fallback value: the shell command below is used
   unless HOME is passed explicitly as a parameter when the script is run. */
%default HOME `echo \$HOME/Software/`

/* Avro uses the json-simple library and is part of the piggybank libraries;
   since Pig 0.12, AvroStorage and TrevniStorage are built-ins */
REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar

/* Short aliases for the piggybank storage class and UDFs used later in this script */
DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
DEFINE substr org.apache.pig.piggybank.evaluation.string.SUBSTRING();
DEFINE tohour org.apache.pig.piggybank.evaluation.datetime.truncate.ISOToHour();

/* MongoDB libraries and configuration */
REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar

/* Short alias for the mongo-hadoop storage class used by the final STORE */
DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();

/* Default reduce-side parallelism, and speculative execution switched off for
   both map and reduce tasks.
   NOTE(review): speculative execution is presumably disabled so that duplicate
   task attempts do not write the same records to MongoDB twice - confirm. */
set default_parallel 5
set mapred.map.tasks.speculative.execution false
set mapred.reduce.tasks.speculative.execution false

/*
 * Macro that extracts the hour from a date/time value in ISO 8601 format.
 *
 * Parameters:
 *   relation  - input relation; each record must expose a date field
 *   field_in  - field of the record whose address sub-field is flattened out
 *   field_out - name given to the flattened address column in the result
 *
 * Returns a relation with two columns: the flattened address (named by
 * field_out) and sent_hour, i.e. characters 11 to 13 of the date after it has
 * been truncated to the hour by tohour.
 */
DEFINE extract_time(relation, field_in, field_out) RETURNS times {
  $times = FOREACH $relation GENERATE
               FLATTEN($field_in.(address)) AS $field_out,
               substr(tohour(date), 11, 13) AS sent_hour;
};

/* Python UDFs, executed through Jython and exposed under the funcs namespace */
REGISTER 'udfs.py' USING jython AS funcs;

/* Remove any previous output left at the Avro output path */
rmf /tmp/sent_distributions.avro

/* Load the Avro-encoded emails and keep only records that have both a date and a sender */
emails = LOAD '/me/Data/test_mbox' USING AvroStorage();
filtered = FILTER emails BY (date IS NOT NULL) AND (from IS NOT NULL);

/* Addresses found in reply_to must be treated as fully valid sender addresses,
   otherwise we risk ending up with messages that have no sender */
SPLIT filtered INTO
    has_reply_to IF (reply_tos IS NOT NULL),
    no_reply_to  IF (reply_tos IS NULL);

/* When a message has both the from and reply_to fields set, count both addresses */
reply_to       = extract_time(has_reply_to, reply_tos, from);
reply_to_froms = extract_time(has_reply_to, from, from);
froms          = extract_time(no_reply_to, from, from);

/* One combined relation of (from, sent_hour) rows from all three sources */
all_froms = UNION reply_to, reply_to_froms, froms;

/* Lower-case the sender address so that differently-cased spellings are counted together */
pairs = FOREACH all_froms GENERATE
            LOWER(from) AS sender_email_address,
            sent_hour;

/* Count the rows for every (sender address, hour) combination */
by_sender_and_hour = GROUP pairs BY (sender_email_address, sent_hour);
sent_times = FOREACH by_sender_and_hour GENERATE
                 FLATTEN(group) AS (sender_email_address, sent_hour),
                 COUNT_STAR(pairs) AS total;

/* Note the use of ORDER (sorting) inside the nested FOREACH block:
   per sender, rows with a null hour or null total are dropped and the
   remaining (sent_hour, total) pairs are sorted by hour */
by_sender = GROUP sent_times BY sender_email_address;
sent_distributions = FOREACH by_sender {
    complete = FILTER sent_times BY (sent_hour IS NOT NULL) AND (total IS NOT NULL);
    by_hour = ORDER complete BY sent_hour;
    GENERATE group AS address, by_hour.(sent_hour, total) AS sent_distribution;
};

/* Pass every sender's distribution through the fill_in_blanks UDF from udfs.py.
   NOTE(review): presumably this adds entries for hours with no messages - confirm against udfs.py */
filled_dist = FOREACH sent_distributions GENERATE
                  address,
                  funcs.fill_in_blanks(sent_distribution) AS sent_distribution;

/* Alternative Avro sink, currently disabled; results are written to MongoDB instead */
-- store filled_dist into '/tmp/sent_distributions.avro' using AvroStorage();
STORE filled_dist INTO 'mongodb://localhost/agile_data.sent_distributions' USING MongoStorage();

