Hatena::ブログ(Diary)

Qualia綴り - renewal

13-07-11-Thu

[R]SVMメモ

久しぶりにSVM使うことになりそうなので、1年くらい前にやったときのコードをメモ(チューニング方法思い出すために)

あと、訓練データセットのクラス比率を揃えるべき(水増しとか重みづけとかで)

# SVM作成 
# 交差検定(10分割)
# 対象の値動きの方向性に着目した経済時系列予測へのサポートベクターマシンの応用に関する研究.pdf を参照
# SVMチューニング方法参考 http://d.hatena.ne.jp/hoxo_m/20110325/p1

library(e1071)

## データ入力
data_train = read.table('data_train.tsv',header=TRUE)
data_test = read.table('data_test.tsv',header=TRUE)

## SVM下準備
# 交差検定の分割数決定
1+log(327)/log(2)
# 結果は9.35 ただ、論文にしたがって10で実施
 
 # とりあえず、一回SVMしてみる
model = svm(Class ~ ., data=data_train, cross=10)

## グリッドサーチ
gammaRange = 10^(-5:5)
costRange = 10^(-2:2)
t = tune.svm(Class ~ ., data = data_train, gamma=gammaRange, cost=costRange, tunecontrol = tune.control(sampling="cross", cross=10))

#可視化
cat("- best parameters:\n")
cat("gamma =", t$best.parameters$gamma, "; cost =", t$best.parameters$cost, ";\n")
cat("accuracy:", 100 - t$best.performance * 100, "%\n\n")
plot(t, transform.x=log10, transform.y=log10)

# 細かくグリッドサーチ 1箇所目
gamma = 10^(-4)
cost = 10^(1)
gammaRange = 10^seq(log10(gamma)-1,log10(gamma)+1,length=11)[2:10]
costRange =10^seq(log10(cost)-1 ,log10(cost)+1 ,length=11)[2:10]
t1 = tune.svm(Class ~ ., data = data_train, gamma=gammaRange, cost=costRange, tunecontrol = tune.control(sampling="cross", cross=10))
cat("[gamma =", gamma, ", cost =" , cost , "]\n")
cat("- best parameters:\n")
cat("gamma =", t1$best.parameters$gamma, "; cost =", t1$best.parameters$cost, ";\n")
cat("accuracy:", 100 - t1$best.performance * 100, "%\n\n")
plot(t1, transform.x=log10, transform.y=log10)

# 細かくグリッドサーチ 2箇所目
gamma = 1e-5
cost = 100
gammaRange = 10^seq(log10(gamma)-1,log10(gamma)+1,length=11)[2:10]
costRange =10^seq(log10(cost)-1 ,log10(cost)+1 ,length=11)[2:10]
t2 = tune.svm(Class ~ ., data = data_train, gamma=gammaRange, cost=costRange, tunecontrol = tune.control(sampling="cross", cross=10))
cat("[gamma =", gamma, ", cost =" , cost , "]\n")
cat("- best parameters:\n")
cat("gamma =", t2$best.parameters$gamma, "; cost =", t2$best.parameters$cost, ";\n")
cat("accuracy:", 100 - t2$best.performance * 100, "%\n\n")
plot(t2, transform.x=log10, transform.y=log10)


# グリッドサーチ結果でモデル作成
gamma = 1e-04
cost = 63.09573
model = svm(Class~ ., data = data_train, gamma=gamma, cost=cost)

## 作ったモデルで予測
pred = predict(model, data_train, decision.values=TRUE)
table(pred, data_train$Class)

# テストデータに対して実施
data_test$pred  = predict(model, data_test, decision.values=TRUE)
data_test$dist = abs(data.frame(attr(data_test$pred, "decision.values"))$up.down)
table(data_test$pred, data_test$Class)
length(subset(data_test, pred==Class)[,1])/length(data_test[,1])

# 距離を信頼度として用いる
threshold = 0.5 # 下位50%は使用しない
sub = subset(data_test,dist>quantile(data_test$dist, threshold))
table(sub$pred, sub$Class)
length(subset(sub, pred==Class)[,1])/length(sub[,1])

# 分布確かめ
hit_curve = c()
data_train$pred  = predict(model, data_train, decision.values=TRUE)
data_train$dist = abs(data.frame(attr(data_train$pred, "decision.values"))$up.down)
for (ii in 1:100){
	threshold = ii/100
	sub = subset(data_train,dist>quantile(data_train$dist, threshold))
	hit_curve[ii] = length(subset(sub, pred==Class)[,1])/length(sub[,1])
}
plot(hit_curve)

# 分布確かめ テストデータ
hit_curve = c()
for (ii in 1:100){
	threshold = ii/100
	sub = subset(data_test,dist>quantile(data_test$dist, threshold))
	hit_curve[ii] = length(subset(sub, pred==Class)[,1])/length(sub[,1])
}
plot(hit_curve)

# 分割して、それぞれの内部で割合 幅0.1の移動平均
hit_curve = c()
for (ii in 1:100){
	threshold = ii/100
	sub = subset(subset(data_test,dist>=quantile(data_test$dist, threshold)), dist<quantile(data_test$dist, threshold+0.1))
	hit_curve[ii] = length(subset(sub, pred==Class)[,1])/length(sub[,1])
}
plot(hit_curve)


## 売買を行った場合のシミュレーション
# 終値で売買出来た場合
threshold = 0.9
amount = c(1)
for (ii in 1:(length(data_test[,1])-1)){
	if (data_test$pred[ii]=="up" & data_test$dist[ii]> quantile(data_test$dist, threshold)){
		amount = c(amount, rev(amount)[1]*data_test$Nikkei_Close[ii+1]/data_test$Nikkei_Close[ii])
	}
}
plot(amount)

threshold = 0.8
amount = c(1)
for (ii in 1:(length(data_test[,1])-1)){
	if (data_test$pred[ii]=="up" & data_test$dist[ii]< quantile(data_test$dist, threshold)){
		amount = c(amount, rev(amount)[1]*data_test$Nikkei_Close[ii+1]/data_test$Nikkei_Close[ii])
	}
}
plot(amount)

13-05-28-Tue

角度データの扱い

角度のデータは悩みだすと結構はまる。分散出すだけで一苦労だったのでメモ。

<参考ページ>

角度統計http://en.wikipedia.org/wiki/Directional_statistics

角度分散: http://www.ebi.ac.uk/thornton-srv/software/PROCHECK/nmr_manual/man_cv.html

[]Rで無名関数

ダメ元でググってみたら、普通にあって嬉しい

(function(arg){return(sum(arg,na.rm=T))})(c(1,2,3))
# [1] 6

つまりこんな書き方も可能。apply系列に使うには便利

tapply(dat$ang_sq,id_fac,(function(arg){return(sum(arg,na.rm=T))}))^2
# (オプションつけるだけなら、以下で十分)
tapply(dat$ang_sq,id_fac,sum,na.rm=T)^2

13-02-4-Mon

なにこの…なに

全部連想配列扱いっていう、やってることは分かるけど、javascriptきもちわるい

b=[]
// []
b[1.2]=1
// 1
b[1.2]=1
// 1
b[1.3]=1
// 1
b[1.4]=2
// 2
b[1.5]=2
// 2
b[1.6]=3
// 3
b
// []
b.length
// 0