Hatena::ブログ(Diary)

橋本詳解 RSSフィード

ここはメモ帳です

http://shokai.org
http://shokai.org/blog/

2008-12-21

bot ahokaiの作成

さっきできた。3時間ぐらいかかった。 http://twitter.com/ahokai



search on google

に触発されて作った

いつかどこかでn-gramモデルを作って文章を連結させると自然になると読んだので、やってみた。

DBに溜めたpostをmecabで分割する所で、文字コード関係でつまずいていたが、色々やっていたらなんとかなった。

口調が完全に自分と同じなのできもい。


まず自分のpostを収集するDBを作成

migrate_posts.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'active_record'
require File.dirname(__FILE__) + "/model_post.rb"

# Point ActiveRecord at the SQLite file given by the relative path 'db_posts'.
# (Use ':memory:' as the :dbfile value to experiment without a file.)
posts_db_config = {
  :adapter => 'sqlite3',
  :dbfile  => 'db_posts',
  :timeout => 30000
}
ActiveRecord::Base.establish_connection(posts_db_config)

# Migration that creates (up) or drops (down) the `posts` table in which
# collected posts are archived.
class PostMigration < ActiveRecord::Migration
  # Create the posts table; every column is mandatory.
  def self.up
    create_table(:posts) do |t|
      t.string :message, :null => false
      # store.rb looks rows up by this column to skip entries it already saved.
      t.string :uri, :null => false
      # :datetime rather than :time -- a :time column only round-trips the
      # time of day, so the date a post was made would be lost when read back.
      t.datetime :time, :null => false
    end
  end

  # Remove the posts table together with everything stored in it.
  def self.down
    drop_table :posts
  end
end

# Command-line entry point.
#
# With "up" or "down" as the first argument the migration is run in that
# direction. With anything else (or no argument) the rows already stored are
# printed on a best-effort basis, followed by the usage line, and the script
# exits with status 1.
unless %w[up down].include?(ARGV[0])
  begin
    Post.find(:all).each { |post| puts post.to_s }
  rescue
    # Listing is best effort only; any failure to read the table ends up here
    # (presumably most often because "up" has not been run yet).
    puts "couldn't connect dbfile"
  end
  # The usage text must name this script (migrate_posts.rb), not "migrate.rb".
  puts 'usage: "ruby migrate_posts.rb up"  or  "ruby migrate_posts.rb down"'
  exit(1)
end

PostMigration.migrate(ARGV[0])

activerecordのmodel

model_post.rb

# ActiveRecord model for one row of the `posts` table.
class Post < ActiveRecord::Base
  # Render the record as its time, message and uri separated by single spaces.
  def to_s
    [time, message, uri].join(' ')
  end
end

収集スクリプト。3200件までは取得できた。

store.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'active_record'
require 'feed-normalizer'
require 'open-uri'
require 'kconv'
require File.dirname(__FILE__) + "/model_post.rb"

# Twitter ID of the account whose timeline is collected.
user = "3631571"

# Open the SQLite file (relative path 'db_posts') that collected posts go into.
# (Use ':memory:' as the :dbfile value to experiment without a file.)
posts_db_config = {
  :adapter => 'sqlite3',
  :dbfile  => 'db_posts',
  :timeout => 30000
}
ActiveRecord::Base.establish_connection(posts_db_config)


# Walk the account's Atom timeline page by page (pages 1 to 160) and store
# every entry whose URL is not in the posts table yet.
(1..160).each do |page|
  uri = "http://twitter.com/statuses/user_timeline/#{user}.atom?page=#{page}"
  feed = FeedNormalizer::FeedNormalizer.parse open(uri)

  puts uri
  # NOTE(review): parse appears to return nil when the document cannot be
  # parsed -- treat such a page as having no entries instead of crashing.
  entries = feed ? feed.entries : []
  entries.each do |e|
    # Entries are identified by their URL; skip the ones already stored.
    next if Post.find_by_uri(e.url)

    # Decode numeric character references (&#12354; / &#x3042;) to UTF-8.
    # At least one digit is required, so a bare "&#;" is left untouched rather
    # than turned into a NUL character, and hex references of any length up to
    # six digits are handled (not only exactly four). The digit limits keep
    # the code point small enough for pack('U').
    message = e.content.gsub(/&#(?:(\d{1,7})|[xX]([0-9a-fA-F]{1,6}));/) do
      code_point = $1 ? $1.to_i : $2.to_i(16)
      [code_point].pack('U')
    end

    post = Post.create(:uri => e.url,
                       :message => message,
                       :time => e.last_updated)
    puts post
  end
  # Pause between page requests.
  sleep 10
end

n-gramを作るためのDBのmigration用

migrate_ngrams.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'active_record'
require File.dirname(__FILE__) + "/model_ngram.rb"

# Point ActiveRecord at the SQLite file given by the relative path 'db_ngrams'.
# (Use ':memory:' as the :dbfile value to experiment without a file.)
ngrams_db_config = {
  :adapter => 'sqlite3',
  :dbfile  => 'db_ngrams',
  :timeout => 30000
}
ActiveRecord::Base.establish_connection(ngrams_db_config)

# Migration that creates (up) or drops (down) the `ngrams` table: three
# string columns a, b, c plus an occurrence counter.
class NgramMigration < ActiveRecord::Migration
  # Create the ngrams table; every column is mandatory.
  def self.up
    create_table(:ngrams) do |t|
      # The three consecutive words of one 3-gram.
      [:a, :b, :c].each do |word_column|
        t.string word_column, :null => false
      end
      # How many times this exact triple has been recorded.
      t.column :count, :int, :null => false
    end
  end

  # Remove the ngrams table.
  def self.down
    drop_table :ngrams
  end
end

# Command-line entry point.
#
# "up" / "down" as first argument: echo the direction and run the migration.
# Anything else: print the stored n-grams on a best-effort basis, show the
# usage line and exit with status 1.
direction = ARGV[0]

unless %w[up down].include?(direction)
  begin
    Ngram.find(:all).each { |ng| puts ng.to_s }
  rescue
    # Any failure while listing the table is reported with this one message.
    puts "couldn't connect dbfile"
  end
  puts 'usage: "ruby migrate_ngrams.rb up"  or  "ruby migrate_ngrams.rb down"'
  exit(1)
end

puts direction
NgramMigration.migrate(direction)

今回は3-gramにする。active_recordのmodel。

model_ngram.rb

# ActiveRecord model for one row of the `ngrams` table.
class Ngram < ActiveRecord::Base
  # Render the record as its a, b, c and count values separated by single spaces.
  def to_s
    [a, b, c, count].join(' ')
  end
end

収集した3000ぐらいのpostからn-gramモデルを作成するスクリプト

make3gram.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'MeCab'
require 'active_record'
require 'kconv'
require File.dirname(__FILE__) + "/model_post.rb"
require File.dirname(__FILE__) + "/model_ngram.rb"
#$KCODE = 'UTF8'


# Builds the connection options for one SQLite file; only the file differs
# between the two databases this script talks to.
# (Use ':memory:' as the file to experiment without touching disk.)
sqlite_config = lambda { |db_file|
  { :adapter => 'sqlite3', :dbfile => db_file, :timeout => 30000 }
}

# Step 1: connect to the posts database and pull every message into memory.
ActiveRecord::Base.establish_connection(sqlite_config.call('db_posts'))

mecab = MeCab::Tagger.new('-Ochasen')

messages = Post.find(:all).map { |stored_post| stored_post.message }

# Step 2: switch the shared connection over to the n-gram database; from here
# on only Ngram records are read and written.
ActiveRecord::Base.establish_connection(sqlite_config.call('db_ngrams'))

# Tokenise every collected message with MeCab and record each run of three
# consecutive tokens in the ngrams table, incrementing the counter of a
# triple that is already present.
messages.each do |message|
  puts message
  parsed = mecab.parse(message.gsub("shokai: ",""))

  # The token is taken to be the text before the first tab of each output
  # line. String#each no longer exists on Ruby 1.9+, so iterate with each_line.
  words = []
  parsed.each_line do |line|
    # Skip only the terminator line itself (a line reading exactly "EOS"),
    # not every token that merely contains the letters "EOS".
    next if line.chomp == 'EOS'
    words.push(line.split(/\t/)[0])
  end

  # Slide a window of three tokens over the message; messages with fewer than
  # three tokens produce no n-gram.
  words.each_cons(3) do |a, b, c|
    puts a+b+c
    ng = Ngram.find(:first, :conditions => ["a=? and b=? and c=?", a, b, c])
    if ng
      ng.count += 1
      ng.save
    else
      Ngram.create(:a => a,
                   :b => b,
                   :c => c,
                   :count => 1)
    end
  end
end

作成中

making n-gram model


できた3-gramを連結させて文章を作ってtwitterに投稿するスクリプト

post3gram.rb

#!/usr/bin/env ruby
require 'rubygems'
require 'MeCab'
require 'active_record'
require 'kconv'
gem 'twitter'
require 'twitter'
require File.dirname(__FILE__) + "/model_ngram.rb"
#$KCODE = 'UTF8'

# Twitter credentials handed to Twitter::Base further down.
user = "username"
pass = "password"

# The n-gram database is located relative to this script's own directory.
# (Use ':memory:' as the :dbfile value to experiment without a file.)
ngrams_db_file = File.dirname(__FILE__) + '/db_ngrams'
ngrams_db_config = {
  :adapter => 'sqlite3',
  :dbfile  => ngrams_db_file,
  :timeout => 30000
}
ActiveRecord::Base.establish_connection(ngrams_db_config)



# Generate a sentence by chaining 3-grams and post it to Twitter.
#
# A random stored 3-gram seeds the text. Then, up to 50 times, a random 3-gram
# whose first two words equal the last two words of the current one is chosen
# and its third word is appended.
ngs = Ngram.find(:all)
# Without any stored n-gram there is nothing to seed the sentence with; stop
# with a clear message instead of failing on a nil record.
abort 'no n-grams stored in db_ngrams' if ngs.empty?

head = ngs[rand(ngs.size)]
puts head.to_s
results = head.a+head.b+head.c

50.times do
  candidates = Ngram.find(:all, :conditions => ["a=? and b=?",head.b , head.c])
  # Dead end: no stored 3-gram continues the current word pair. This is tested
  # explicitly rather than by rescuing the error raised on a nil record, so
  # genuine failures (e.g. database errors) are no longer silently swallowed.
  break if candidates.empty?

  head = candidates[rand(candidates.size)]
  puts head.to_s
  results << head.c
end


twit = Twitter::Base.new(user, pass)
twit.update(results.toutf8)

puts results

スパム対策のためのダミーです。もし見えても何も入力しないでください
ゲスト


画像認証

トラックバック - http://d.hatena.ne.jp/shokai/20081221/1229879343