Complement Naive BayesをPHPで実装した

アルゴリズムの理解を深める為、Rubyで書かれた実装をそのままPHPに移植してみた。
 
Complement Naive Bayes らしきものをRubyで書いた - 記録用
http://d.hatena.ne.jp/laughing/20101114/1289698415
 

<?php

/**
 * Complement Naive Bayes style classifier over bag-of-words features.
 *
 * Each class is scored using the word counts of every OTHER class (its
 * "complement"). Feed labelled documents with training(), then obtain one
 * score per known class with classifier().
 */
class CNB
{
	/**
	 * Accumulated word counts: label => array(word => count).
	 *
	 * Declared explicitly because creating properties dynamically inside the
	 * constructor is deprecated since PHP 8.2.
	 *
	 * @var array
	 */
	public $frequency_of_word_by_class = array();

	/**
	 * Number of training() calls seen per label: label => count.
	 *
	 * @var array
	 */
	public $number_of_training_data_of_class = array();

	/**
	 * Additive smoothing constant applied to every complement word count.
	 *
	 * @var int|float
	 */
	public $smoothing_parameter = 1;

	/**
	 * @param int|float $smoothing_parameter Additive smoothing constant.
	 */
	public function __construct($smoothing_parameter=1)
	{
		$this->frequency_of_word_by_class = array();
		$this->number_of_training_data_of_class = array();
		$this->smoothing_parameter = $smoothing_parameter;
	}

	/**
	 * Adds one training document to the model.
	 *
	 * @param string|int $label Class label of the document.
	 * @param array      $sosei Features of the document as word => count
	 *                          (numeric strings are accepted).
	 * @return void
	 */
	public function training($label, $sosei)
	{
		if (!isset($this->frequency_of_word_by_class[$label])) {
			$this->frequency_of_word_by_class[$label] = array();
		}

		foreach ($sosei as $word=>$count) {
			if (!isset($this->frequency_of_word_by_class[$label][$word])) {
				$this->frequency_of_word_by_class[$label][$word] = 0;
			}
			$this->frequency_of_word_by_class[$label][$word] += $count;
		}

		if (!isset($this->number_of_training_data_of_class[$label])) {
			$this->number_of_training_data_of_class[$label] = 0;
		}
		$this->number_of_training_data_of_class[$label]++;
	}

	/**
	 * Tells whether two label values refer to the same class.
	 *
	 * Labels are compared by their string form. A loose `==` must not be used
	 * here: PHP turns integer-like array keys into ints and compares numeric
	 * strings numerically, so e.g. the distinct labels "10" and "1e1" would be
	 * considered equal.
	 *
	 * @param string|int $a
	 * @param string|int $b
	 * @return bool
	 */
	private function is_same_label($a, $b)
	{
		return (string)$a === (string)$b;
	}

	/**
	 * Total number of word occurrences in every class except $c.
	 *
	 * @param string|int $c Label of the class to exclude.
	 * @return int|float Sum of all word counts of the other classes.
	 */
	public function total_number_of_word_in_other_class($c)
	{
		$total = 0;
		foreach ($this->frequency_of_word_by_class as $label=>$counts) {
			if ($this->is_same_label($label, $c)) {
				continue;
			}
			// Every word of a class is stored in that class's own map, so
			// summing the map directly covers the whole vocabulary.
			$total += array_sum($counts);
		}
		return $total;
	}

	/**
	 * Number of occurrences of word $w in every class except $c.
	 *
	 * @param string|int $c Label of the class to exclude.
	 * @param string|int $w Word to count.
	 * @return int|float Sum of the counts of $w over the other classes.
	 */
	public function number_of_word_in_other_class($c, $w)
	{
		$total = 0;
		foreach ($this->frequency_of_word_by_class as $label=>$counts) {
			if ($this->is_same_label($label, $c)) {
				continue;
			}
			if (isset($counts[$w])) {
				$total += $counts[$w];
			}
		}
		return $total;
	}

	/**
	 * Scores a document against every trained class.
	 *
	 * For each class c the score is
	 *   log(docs_in_c / all_docs)
	 *     - sum over words w of count(w) * log((N_other(c, w) + s) / (N_other(c) + alpha))
	 * where s is the smoothing parameter.
	 *
	 * @param array $sosei Features of the document as word => count.
	 * @return array List of array(label, score), one entry per trained class
	 *               in the order the classes were first seen. No class is
	 *               selected here; picking the best score is up to the caller.
	 */
	public function classifier($sosei)
	{
		$all_training_data = array_sum($this->number_of_training_data_of_class);

		// Does not depend on the class, so it is computed once.
		// NOTE(review): alpha is scaled by the number of features in THIS
		// document; CNB formulations usually scale by the vocabulary size.
		// Kept as-is to preserve the existing scores -- confirm the intent.
		$alpha = $this->smoothing_parameter * count($sosei);

		$result = array();
		foreach (array_keys($this->frequency_of_word_by_class) as $label) {
			$n_c = $this->total_number_of_word_in_other_class($label);

			$term2nd = 0.0;
			foreach ($sosei as $word=>$count) {
				$term2nd += $count * log(($this->number_of_word_in_other_class($label, $word) + $this->smoothing_parameter) / ($n_c + $alpha));
			}

			$theta_c = $this->number_of_training_data_of_class[$label] / $all_training_data;
			$result[] = array($label, log($theta_c) - $term2nd);
		}
		return $result;
	}
}


/**
 * Parses a data file into an ordered list of samples.
 *
 * Every non-blank line has the form
 *   label word1:count1 word2:count2 ...
 * and yields one sample, so several lines may share the same label.
 *
 * @param string $fname Path of the file to read.
 * @return array List of array(label, features), one per line in file order;
 *               features is array(word => count) with the counts kept as the
 *               numeric strings found in the file.
 * @throws RuntimeException When the file cannot be read.
 */
function read_samples($fname)
{
	$txt = file_get_contents($fname);
	if ($txt === false) {
		throw new RuntimeException("Unable to read data file: " . $fname);
	}

	$samples = array();
	foreach (preg_split("/[\r\n]+/", $txt) as $line) {
		// Split on runs of blanks so repeated or trailing spaces do not
		// produce empty tokens.
		$tokens = preg_split("/[ \t]+/", trim($line), -1, PREG_SPLIT_NO_EMPTY);
		if (count($tokens) === 0) {
			continue;
		}

		$label = array_shift($tokens);
		$sosei = array();
		foreach ($tokens as $kv) {
			$pair = explode(":", $kv, 2);
			// Skip tokens that are not "word:number" instead of storing a
			// bogus feature that would break the arithmetic later on.
			if (count($pair) !== 2 || !is_numeric($pair[1])) {
				continue;
			}
			$sosei[$pair[0]] = $pair[1];
		}
		$samples[] = array($label, $sosei);
	}
	return $samples;
}

/**
 * Parses a data file into a map keyed by label.
 *
 * Kept for backward compatibility. Because the result is keyed by label, only
 * the LAST line of each label survives; use read_samples() when every line
 * matters (e.g. for training data).
 *
 * @param string $fname Path of the file to read.
 * @return array label => array(word => count).
 * @throws RuntimeException When the file cannot be read.
 */
function read_file($fname)
{
	$result = array();
	foreach (read_samples($fname) as $sample) {
		$result[$sample[0]] = $sample[1];
	}
	return $result;
}

// Command line entry point: php <script> <training-file> <test-file>
if (!isset($argv[1], $argv[2])) {
	$script = isset($argv[0]) ? $argv[0] : 'cnb.php';
	fwrite(STDERR, "Usage: php " . $script . " <training-file> <test-file>\n");
	exit(1);
}

$cnb = new CNB();

// Train on every line; keying the data by label would keep only one document
// per class.
foreach (read_samples($argv[1]) as $sample) {
	list($label, $sosei) = $sample;
	$cnb->training($label, $sosei);
}

// Print the per-class scores of every test line, including lines whose labels
// repeat.
foreach (read_samples($argv[2]) as $sample) {
	print_r($cnb->classifier($sample[1]));
}

 
元記事からのDropboxへのリンクが404になっているけど、入力データは多分以下のようなフォーマット(1行につき1件)。

ラベル1 ワード1:ワード1の出現数 ワード2:ワード2の出現数 ワード3:ワード3の出現数 …

実際のデータはこんな感じ。

label1 aaa:1 bbb:2 ccc:3