12. UDTF (parameter-mix)
-- Parameter mixing: average, per feature, the weights emitted by the
-- training UDTF.
-- NOTE: `..` is a placeholder kept from the original; the remaining UDTF
-- arguments are not shown.
select
  feature,
  CAST(avg(weight) as FLOAT) as weight
from (
  -- Map-only stage: the number of mappers launched follows Hadoop's
  -- InputSplit size setting. Each call emits (feature, weight) rows.
  select
    TrainLogisticSgdUDTF(features,label,..) as (feature,weight)
  from train
) t
group by feature;
どうやってiterative parameter mixさせよう???
古いmodelを渡さないといけない
毎行ごとにmodel全体を渡すのは非効率だし…
12
13. UDTF(iterative parameter mix)
-- Iterative parameter mixing: materialize a model table by averaging, per
-- feature, the weights emitted by the iterative training UDTF.
-- NOTE: `..` is a placeholder kept from the original; the remaining UDTF
-- arguments are not shown.
CREATE TABLE model1sgditor2 AS
SELECT
    feature,
    CAST(AVG(weight) AS FLOAT) AS weight
FROM (
    -- Each training row is joined on rowid with its feature_weight row so
    -- the UDTF receives that row's features, wlist and label together.
    SELECT
        TrainLogisticIterUDTF(tr.features, fw.wlist, tr.label, ..)
            AS (feature, weight)
    FROM training tr
    INNER JOIN feature_weight fw
        ON (tr.rowid = fw.rowid)
) trained
GROUP BY feature;
ここでは、各行の素性ごとに古いModel
(Map<feature, weight>)とlabel相当を渡せばよいので、
Array<feature>に対応するArray<weight>を持つテーブルを作って
inner joinで渡す
13
14. Pig版のフローの一例
-- Training phase: load the raw log, sample it, spread the sample over
-- random buckets, train one model per bucket, then persist and reload the
-- resulting models.
training_raw = LOAD '$TARGET' AS (
    clicks: int,
    impression: int,
    displayid: int,
    adid: int,
    advertiserid: int,
    depth: int,
    position: int,
    queryid: int,
    keywordid: int,
    titleid: int,
    descriptionid: int,
    userid: int,
    gender: int,
    age: int
);

-- BinSplit consumes (clicks, impression); its output is flattened in front
-- of the remaining columns.
training_bin = FOREACH training_raw GENERATE
    FLATTEN(predictor.ctr.BinSplit(clicks, impression)),
    displayid, adid, advertiserid, depth, position, queryid,
    keywordid, titleid, descriptionid, userid, gender, age;

-- Keep roughly 10% of the rows.
training_smp = SAMPLE training_bin 0.1;

-- Tag every sampled row with a random bucket id in 0..99 and wrap the whole
-- row into a single tuple field named `training`.
training_rnd = FOREACH training_smp GENERATE
    (int)(RANDOM() * 100) AS dataid,
    TOTUPLE(*) AS training;

training_dat = GROUP training_rnd BY dataid;

-- One TrainLinear call per bucket.
-- NOTE(review): the projection `training_rnd.training.training_smp` names a
-- relation alias as a field; confirm this resolves as intended.
model = FOREACH training_dat GENERATE
    predictor.ctr.TrainLinear(training_rnd.training.training_smp);

STORE model INTO '$MODEL';

-- Reload the stored models as a single map-typed column.
model = LOAD '$MODEL' AS (mdl: map[]);
弱学習
-- Prediction phase: score every test row with each of 10 models, combine
-- the per-model scores per dataid, and store the combined value.
model_lmt = LIMIT model 10;

-- NOTE(review): this loads '$TARGET', the same parameter the training load
-- uses, but with a different schema (dataid first, no clicks/impression);
-- presumably the parameter is rebound for testing. Verify.
testing_raw = LOAD '$TARGET' AS (
    dataid: int,
    displayid: int,
    adid: int,
    advertiserid: int,
    depth: int,
    position: int,
    queryid: int,
    keywordid: int,
    titleid: int,
    descriptionid: int,
    userid: int,
    gender: int,
    age: int
);

-- Pair every kept model with every test row.
testing_with_model = CROSS model_lmt, testing_raw;

result = FOREACH testing_with_model GENERATE
    dataid,
    predictor.ctr.Pred(
        mdl, displayid, adid, advertiserid, depth, position, queryid,
        keywordid, titleid, descriptionid, userid, gender, age
    ) AS ctr;

-- Collect all per-model ctr values for the same dataid and pass them to
-- Ensemble.
result_grp = GROUP result BY dataid;
result_ens = FOREACH result_grp GENERATE
    group AS dataid,
    predictor.ctr.Ensemble(result.ctr);

-- Sort by dataid, then emit only the second column (the Ensemble output).
result_ens_ord = ORDER result_ens BY dataid;
result_fin = FOREACH result_ens_ord GENERATE $1;

STORE result_fin INTO '$RESULT';
アンサンブル学習
14