/* Set the home directory (the directory where the software is installed) */
%default HOME `echo \$HOME/Software/`

/* Avro uses json-simple and ships with the piggybank libraries. As of Pig 0.12, AvroStorage and TrevniStorage are built in */
REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.7.4.jar
REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar

DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
DEFINE LENGTH org.apache.pig.piggybank.evaluation.string.LENGTH();

REGISTER $HOME/varaha/lib/*.jar /* Varaha contains a good tokenizer */
REGISTER $HOME/varaha/target/varaha-1.0-SNAPSHOT.jar 

DEFINE TokenizeText varaha.text.TokenizeText('1', '1');

set default_parallel 20

/* Remove the output of previous runs. These paths must match the STORE targets used
   later in this script, otherwise a re-run fails because the output location already exists */
rmf /tmp/trimmed_tokens.txt
rmf /tmp/ntf_idf_scores_per_message.txt
rmf /tmp/ntf_idf_scores_per_address.txt

/* Jython UDFs (namespace funcs) and the macro file that provides ntf_idf */
register 'udfs.py' using jython as funcs;
import 'ntfidf.macro';

/* Load the e-mails and keep only the fields needed downstream */
emails = LOAD '/me/Data/test_mbox' USING AvroStorage();
-- emails = FILTER emails BY body IS NOT NULL;
email_fields = FOREACH emails GENERATE
    message_id,
    body,
    from.address AS address;

/* Project and flatten into message_id/address/token triples, then apply basic filtering */
token_triples = FOREACH email_fields GENERATE
    message_id,
    address,
    FLATTEN(TokenizeText(body)) AS token;
trimmed_tokens = FILTER token_triples
    BY token IS NOT NULL AND token != '' AND LENGTH(token) > 2;
STORE trimmed_tokens INTO '/tmp/trimmed_tokens.txt';

/* Compute word (topic) weights for each message */
ntf_idf_scores_per_message = ntf_idf(trimmed_tokens, 'message_id', 'token');
STORE ntf_idf_scores_per_message INTO '/tmp/ntf_idf_scores_per_message.txt';

/* Compute word (topic) weights for each e-mail address */
ntf_idf_scores_per_address = ntf_idf(trimmed_tokens, 'address', 'token');
STORE ntf_idf_scores_per_address INTO '/tmp/ntf_idf_scores_per_address.txt';
