A crawler in Ruby (fetching Hatena URLs)

  • A crawler in Ruby
    • Fetch the URLs of every 2018 entry on a Hatena Diary blog
require 'anemone'

# Start from the blog's archive page; wait 3 seconds between requests and
# follow links at most two levels deep.
urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']
Anemone.crawl(urls,:delay=>3,:depth_limit=>2) do |anemone|

        # Only queue links whose path contains a 2018 date (e.g. /201801...).
        anemone.focus_crawl do |page|
                #page.links.keep_if{|link| link.to_s.match(/archive\/2[0-9][0-9].*/)}

                page.links.keep_if{|link| link.to_s.match(/.*\/2018[0-9][0-9].*/)}
                #page.links.keep_if{|link| link.to_s.match(/.*\/2[0-9][0-9].*/)}
        end

        # Print the URL of every page that was actually fetched.
        anemone.on_every_page do |page|
                puts page.url
                #puts page.body
        end


end
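The filtering all happens in the regex handed to keep_if inside focus_crawl. A quick standalone check, with made-up sample links (nothing is fetched), of which URLs that regex would keep:

# Illustrative only: the sample links below are hypothetical.
sample_links=[
        'http://d.hatena.ne.jp/arupaka-_-arupaka/archive/201801',
        'http://d.hatena.ne.jp/arupaka-_-arupaka/20180102/1514854800',
        'http://d.hatena.ne.jp/arupaka-_-arupaka/archive/201712',
        'http://d.hatena.ne.jp/arupaka-_-arupaka/about'
]
kept=sample_links.select{|link| link.to_s.match(/.*\/2018[0-9][0-9].*/)}
puts kept   # only the two 2018 URLs survive

The script below extends this to crawl every month and save each entry to disk.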
# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'

id1="red_9512k"
# Start from the diary's archive page.
urls=["http://d.hatena.ne.jp/#{id1}/archive"]
#urls=["http://d.hatena.ne.jp/#{id1}/archive/200512"]

# One tab-separated line per entry goes to data_list/<id>.dat (the data_list/
# directory is assumed to exist already); the raw HTML of each entry section
# is saved under data_source/<id>/.
f_out0=open("data_list/#{id1}.dat","w")
Dir.mkdir("data_source/#{id1}") unless FileTest.exist?("data_source/#{id1}")
#exit()
#
#
#urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/20170120']
#urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']
# Wait 30 seconds between requests, follow links at most two levels deep.
Anemone.crawl(urls,:delay=>30,:depth_limit=>2) do |anemone|

        # Only queue links whose path contains a six-digit year/month (e.g. /200512).
        anemone.focus_crawl do |page|
                #page.links.keep_if{|link| link.to_s.match(/archive\/2[0-9][0-9].*/)}

                puts page.links()
                page.links.keep_if{|link| link.to_s.match(/.*\/[0-9][0-9][0-9][0-9][0-9][0-9].*/)}
                #page.links.keep_if{|link| link.to_s.match(/.*\/2005[0-9][0-9].*/)}
                #page.links.keep_if{|link| link.to_s.match(/.*\/2017[0-9][0-9].*/)}
                #page.links.keep_if{|link| link.to_s.match(/.*\/2[0-9][0-9].*/)}
        end

        #anemone.on_every_page do |page|
        #
        #       puts page.url
        #       #html=Nokogiri(page.body)
        #end
        # Parse only the entry pages, whose URLs end in an eight-digit date.
        anemone.on_pages_like(/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$/) do |page|
        #anemone.on_pages_like(/20170120$/) do |page|

                puts page.url
                # Parse the page and pick up every entry section in the diary body.
                page2=Nokogiri::HTML.parse(page.body)
                v=page2.css("div.body")
                url3=""
                time1=""
                title1=""
                text1_con=""
                # Date of the page, shared by all sections on it.
                time1=page2.css('span.date').text()
                v.css('div.section').each{|j|
                        # Skip the Google AdSense sections; keep real entries only.
                        if not j['id'].to_s.match(/google/) then
                                j.css("h3").each{|k|
                                        title1=k.text.to_s.encode('utf-8')
                                        url3=k.css('a')[0]['href']
                                        #puts url3
                                }
                                #j.css("p.sectionfooter").each{|k|
                                #
                                #       str1=k.text.split("|")[3]
                                #       if not str1.nil? then
                                #               time1=str1.gsub(/\s/,'')
                                #
                                #       end
                                #}

                                #j.css("p.sectionfooter").each{|k|
                                #
                                #       str1=k.text.split("|")[3]
                                #       if not str1.nil? then
                                #               time1=str1.gsub(/\s/,'')
                                #
                                #       end
                                #}
                                # Keep the section's raw HTML for the per-entry dump file.
                                text_main1=j.to_s.encode('utf-8')
                                #puts text_main1
                                #
                                #text1_con=j.text().encode('utf-8').gsub(/\n/,'__SEPSEP__')
                                #text1_con=j.css('pre').text().gsub(/\n/,'__SEPSEP__')
                                # Drop the category links, then flatten the text to a single line.
                                text1_con0=j
                                text1_con0.css('a.sectioncategory').remove
                                text1_con=text1_con0.text().gsub(/\n/,'__SEPSEP__')


                                # Encode the permalink path into a file name and save the raw HTML there.
                                filename=url3.to_s.gsub(/^\//,'').gsub('/','_PVSE_')
                                filename="data_source/#{id1}/"+filename+".dat"
                                f_out=open(filename,"w")
                                f_out.write(text_main1)
                                f_out.close()
                                # Tab-separated summary line: permalink, date, title, flattened text.
                                write_data=url3+"\t"+time1+"\t"+title1+"\t"+text1_con
                                puts write_data
                                f_out0.puts write_data
                        end
                }
                #exit()
        end



end

f_out0.close()
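The resulting data_list/<id>.dat has four tab-separated columns per line: permalink, date, title, and the entry text with newlines replaced by __SEPSEP__. A minimal sketch of reading it back (the file name simply reuses the id from above):

# Illustrative reader for the tab-separated list written above.
File.foreach("data_list/red_9512k.dat") do |line|
        url3,time1,title1,text1=line.chomp.split("\t",4)
        next if text1.nil?
        text1=text1.gsub('__SEPSEP__',"\n")   # restore the original newlines
        puts "#{time1}  #{title1}  (#{url3})"
end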
  • Turning it into a function
# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'
def crowle_h(id1)
        #id1="red_9512k"
        #urls=['http://d.hatena.ne.jp/red_9512k/archive']
        #
        urls=["http://d.hatena.ne.jp/#{id1}/archive"]
        #
        #urls=["http://d.hatena.ne.jp/#{id1}/archive/200512"]
        f_out0=open("data_list/#{id1}.dat","w")
        Dir.mkdir("data_source/#{id1}") unless FileTest.exist?("data_source/#{id1}")
        #exit()
        #
        #
        #urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/20170120']
        #urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']
        Anemone.crawl(urls,:delay=>3,:depth_limit=>20) do |anemone|

                anemone.focus_crawl do |page|
                        #page.links.keep_if{|link| link.to_s.match(/archive\/2[0-9][0-9].*/)}

                        puts page.links()
                        page.links.keep_if{|link| link.to_s.match(/.*\/[0-9][0-9][0-9][0-9][0-9][0-9].*/)}
                        #page.links.keep_if{|link| link.to_s.match(/.*\/2005[0-9][0-9].*/)}
                        #page.links.keep_if{|link| link.to_s.match(/.*\/2017[0-9][0-9].*/)}
                        #page.links.keep_if{|link| link.to_s.match(/.*\/2[0-9][0-9].*/)}
                end

                #anemone.on_every_page do |page|
                #
                #       puts page.url
                #       #html=Nokogiri(page.body)
                #end
                anemone.on_pages_like(/[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$/) do |page|
                #anemone.on_pages_like(/20170120$/) do |page|

                        puts page.url
                        page2=Nokogiri::HTML.parse(page.body)
                        v=page2.css("div.body")
                        url3=""
                        time1=""
                        title1=""
                        text1_con=""
                        time1=page2.css('span.date').text()
                        v.css('div.section').each{|j|
                                if not j['id'].to_s.match(/google/) then
                                        j.css("h3").each{|k|
                                                title1=k.text.to_s.encode('utf-8')
                                                url3=k.css('a')[0]['href']
                                                #puts url3
                                        }
                                        #j.css("p.sectionfooter").each{|k|
                                        #
                                        #       str1=k.text.split("|")[3]
                                        #       if not str1.nil? then
                                        #               time1=str1.gsub(/\s/,'')
                                        #
                                        #       end
                                        #}

                                        #j.css("p.sectionfooter").each{|k|
                                        #
                                        #       str1=k.text.split("|")[3]
                                        #       if not str1.nil? then
                                        #               time1=str1.gsub(/\s/,'')
                                        #
                                        #       end
                                        #}
                                        text_main1=j.to_s.encode('utf-8')
                                        #puts text_main1
                                        #
                                        #text1_con=j.text().encode('utf-8').gsub(/\n/,'__SEPSEP__')
                                        #text1_con=j.css('pre').text().gsub(/\n/,'__SEPSEP__')
                                        text1_con0=j
                                        text1_con0.css('a.sectioncategory').remove

                                        text1_con=text1_con0.text().gsub(/\n/,'__SEPSEP__')


                                ########puts url3
                                        filename=url3.to_s.gsub(/^\//,'').gsub('/','_PVSE_')

                                        filename="data_source/#{id1}/"+filename+".dat"
                                        f_out=open(filename,"w")
                                        f_out.write(text_main1)
                                        f_out.close()
                                        write_data=url3+"\t"+time1+"\t"+title1+"\t"+text1_con
                                        puts write_data
                                        f_out0.puts write_data
                                end
                        }
                        #exit()
                end



        end
        f_out0.close()
        #gc()
end
id1="tomokk81"
crowle_h(id1)

The version above makes far too many unnecessary requests, so here is a revised version (2018.02.25).
It uses anemone.focus_crawl so the crawler does not wander onto pages it does not need (a minimal sketch of the idea follows below).
https://qiita.com/yoshiokaCB/items/99ac16aba790781c5092
https://qiita.com/tady/items/8a954dfcd03521f8a200
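A minimal standalone sketch of the idea (the id is a placeholder): focus_crawl decides which of a page's links Anemone may queue at all, so only the archive's pagination links (?word=&of=NN) are ever followed and the entry pages themselves are not fetched during this pass.

require 'anemone'

Anemone.crawl(['http://d.hatena.ne.jp/someid/archive'],:delay=>3) do |anemone|
        # Keep only the archive pagination links; everything else is dropped
        # before it can be queued.
        anemone.focus_crawl do |page|
                page.links.keep_if{|link| link.to_s.match(/word=\&of=[0-9][0-9]*/)}
        end
        anemone.on_every_page do |page|
                puts page.url
        end
end

The full revised function is below.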

# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'
def crowle_h(id1)
        #id1="red_9512k"
        #urls=['http://d.hatena.ne.jp/red_9512k/archive']
        #
        urls=["http://d.hatena.ne.jp/#{id1}/archive"]
        #
        #urls=["http://d.hatena.ne.jp/#{id1}/archive/200512"]
        f_out0=open("data_list/#{id1}.dat","w")
        Dir.mkdir("data_source/#{id1}") unless FileTest.exist?("data_source/#{id1}")
        #exit()
        #
        #
        #urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/20170120']
        #urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']
        #Anemone.crawl(urls,:delay=>3,:depth_limit=>2) do |anemone|
        #Gets last url number
        #
        url_list=[]
        # Pass 1: walk only the archive index pages and collect the entry URLs.
        1.times.each{
                Anemone.crawl(urls,:delay=>3,:depth_limit=>30) do |anemone|
                        # Follow only the archive pagination links (?word=&of=NN).
                        anemone.focus_crawl do |page|

                                page.links.keep_if{|link|
                                        link.to_s.match(/word=\&of=[0-9][0-9]*/)

                                }
                        end
                        # On each archive page, collect the per-entry permalinks
                        # (.../<date>/<entry id>) without fetching them yet.
                        anemone.on_every_page do |page|

                                puts page.url
                                #puts page.links
                                links=page.links.select{|i| i.to_s.match(/.*hatena.ne.jp.*\/[0-9][0-9]*\/[0-9][0-9]*/)}
                                #puts links
                                url_list=url_list+(links.map{|i| i.to_s})


                        end

                end
                # uniq! returns nil when nothing was removed, so build a fresh
                # sorted list instead of chaining uniq!.sort!.
                url_list=url_list.uniq.sort
                puts url_list
                f1=open("data_list/url_list_"+id1+".csv","w")
                url_list.each{|i| f1.puts(i.chomp)}
                f1.close()
                #exit()

        }

        # Disabled debug pass (0.times never runs): would just print the collected URLs.
        0.times.each{

                Anemone.crawl(url_list,:delay=>3,:depth_limit=>1) do |anemone|


                        anemone.on_every_page do |page|

                                puts page.url

                        end

                end

        }

        #exit()
        # Pass 2: fetch each collected entry page once and parse it.
        1.times.each{

                Anemone.crawl(url_list,:delay=>3,:depth_limit=>1) do |anemone|

                        anemone.on_every_page do |page|
                                puts page.url
                                page2=Nokogiri::HTML.parse(page.body)
                                v=page2.css("div.body")
                                url3=""
                                time1=""
                                title1=""
                                text1_con=""
                                time1=page2.css('span.date').text()
                                v.css('div.section').each{|j|
                                        if not j['id'].to_s.match(/google/) then
                                                j.css("h3").each{|k|
                                                        title1=k.text.to_s.encode('utf-8')
                                                        url3=k.css('a')[0]['href']
                                                                        #puts url3
                                                }
                                                #j.css("p.sectionfooter").each{|k|
                                                #
                                                #       str1=k.text.split("|")[3]
                                                #       if not str1.nil? then
                                                #               time1=str1.gsub(/\s/,'')
                                                #
                                                #       end
                                                #}

                                                #j.css("p.sectionfooter").each{|k|
                                                #
                                                #       str1=k.text.split("|")[3]
                                                #       if not str1.nil? then
                                                #               time1=str1.gsub(/\s/,'')
                                                #
                                                #       end
                                                #}
                                                text_main1=j.to_s.encode('utf-8')
                                                #puts text_main1
                                                #
                                                #text1_con=j.text().encode('utf-8').gsub(/\n/,'__SEPSEP__')
                                                #text1_con=j.css('pre').text().gsub(/\n/,'__SEPSEP__')
                                                text1_con0=j
                                                text1_con0.css('a.sectioncategory').remove

                                                text1_con=text1_con0.text().gsub(/\n/,'__SEPSEP__')


                                        ########puts url3
                                                filename=url3.to_s.gsub(/^\//,'').gsub('/','_PVSE_')

                                                filename="data_source/#{id1}/"+filename+".dat"
                                                f_out=open(filename,"w")
                                                f_out.write(text_main1)
                                                f_out.close()
                                                write_data=url3+"\t"+time1+"\t"+title1+"\t"+text1_con
                                                puts write_data
                                                f_out0.puts write_data
                                        end
                                }
                                #exit()
                #end
                        end
                end

        }
        f_out0.close()
        #gc()
end

There was still a bug, so a further improvement: also handle pages whose URL is a date only (a small illustration of the clean-up step follows below).
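The new part is a clean-up of url_list: when both a date-only page (.../20170120) and a per-entry permalink for that date (.../20170120/1484900000) were collected, the date-only page is dropped so the same day is not fetched twice. Roughly, with made-up URLs and a hypothetical id:

# Illustration of the dedup step used in the revised function below.
id1='someid'
url_list=[
        'http://d.hatena.ne.jp/someid/20170120',
        'http://d.hatena.ne.jp/someid/20170120/1484900000',
        'http://d.hatena.ne.jp/someid/20170121'
]
date_pages=url_list.select{|i| i.match(/hatena.ne.jp\/#{id1}\/[0-9][0-9]*$/)}
permalinks=url_list.select{|i| i.match(/hatena.ne.jp\/#{id1}\/[0-9][0-9]*\/[0-9][0-9]*$/)}
date_pages.each{|j|
        # Drop the date page if some permalink already covers that date.
        url_list.delete(j) if permalinks.any?{|k| k.include?(j.split("/").reverse[0])}
}
puts url_list   # the bare 20170120 page is gone; 20170121 (no permalink) stays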

# -*- coding: utf-8 -*-
require 'anemone'
require 'nokogiri'
def crowle_h(id1)
        #id1="red_9512k"
        #urls=['http://d.hatena.ne.jp/red_9512k/archive']
        #
        urls=["http://d.hatena.ne.jp/#{id1}/archive"]
        #
        #urls=["http://d.hatena.ne.jp/#{id1}/archive/200512"]
        f_out0=open("data_list/#{id1}.dat","w")
        Dir.mkdir("data_source/#{id1}") unless FileTest.exist?("data_source/#{id1}")
        #exit()
        #
        #
        #urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/20170120']
        #urls=['http://d.hatena.ne.jp/arupaka-_-arupaka/archive']
        #Anemone.crawl(urls,:delay=>3,:depth_limit=>2) do |anemone|
        #Gets last url number
        #
        url_list=[]
        # Pass 1: walk only the archive index pages and collect entry and date-page URLs.
        1.times.each{
                Anemone.crawl(urls,:delay=>3,:depth_limit=>1000) do |anemone|
                        # Follow only the archive pagination links (?word=&of=NN).
                        anemone.focus_crawl do |page|

                                page.links.keep_if{|link|
                                        link.to_s.match(/word=\&of=[0-9][0-9]*/)

                                }
                        end
                        # Collect both per-entry permalinks and date-only pages
                        # (hatena.ne.jp/<id>/<digits>...) without fetching them yet.
                        anemone.on_every_page do |page|

                                puts page.url
                                #links=page.links.select{|i| i.to_s.match(/.*hatena.ne.jp.*\/[0-9][0-9]*\/[0-9][0-9]*/)}
                                links=page.links.select{|i| i.to_s.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*.*[0-9]*$/)}
                                #puts links
                                #exit()
                                url_list=url_list+(links.map{|i| i.to_s})


                        end

                end
                if not url_list.nil? then
                        url_list=url_list.uniq.sort
                        # Date-only pages: .../<id>/<yyyymmdd>
                        url_list_s=url_list.select{|i| i.to_s.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*$/)}
                        # Per-entry permalinks: .../<id>/<yyyymmdd>/<entry id>
                        url_list_out=url_list.select{|i| i.to_s.match(/.*hatena.ne.jp\/#{id1}\/[0-9][0-9]*\/[0-9][0-9]*$/)}

                        # Drop a date-only page when a permalink for the same date exists,
                        # so that day's entries are not fetched twice.
                        url_list_s.each{|j|
                                if url_list_out.any?{|k| k.include?(j.split("/").reverse[0])} then
                                        url_list.delete(j)
                                        #puts "del"
                                end
                        }
                end
                puts url_list
                f1=open("data_list/url_list_"+id1+".csv","w")
                if not url_list.nil? then
                        url_list.each{|i| f1.puts(i.chomp)}
                end
                f1.close()

                #exit()

        }
        #exit()
#

        # Disabled debug pass (0.times never runs): would just print the collected URLs.
        0.times.each{

                Anemone.crawl(url_list,:delay=>3,:depth_limit=>0) do |anemone|


                        anemone.on_every_page do |page|

                                puts page.url

                        end

                end

        }

        #exit()
        # Pass 2: fetch each remaining URL once (depth_limit 0) and parse it.
        1.times.each{

                Anemone.crawl(url_list,:delay=>3,:depth_limit=>0) do |anemone|

                        anemone.on_every_page do |page|
                                puts page.url
                                page2=Nokogiri::HTML.parse(page.body)
                                v=page2.css("div.body")
                                url3=""
                                time1=""
                                title1=""
                                text1_con=""
                                time1=page2.css('span.date').text()
                                v.css('div.section').each{|j|
                                        if not j['id'].to_s.match(/google/) then
                                                j.css("h3").each{|k|
                                                        title1=k.text.to_s.encode('utf-8')
                                                        url3=k.css('a')[0]['href']
                                                                        #puts url3
                                                }
                                                #j.css("p.sectionfooter").each{|k|
                                                #
                                                #       str1=k.text.split("|")[3]
                                                #       if not str1.nil? then
                                                #               time1=str1.gsub(/\s/,'')
                                                #
                                                #       end
                                                #}

                                                #j.css("p.sectionfooter").each{|k|
                                                #
                                                #       str1=k.text.split("|")[3]
                                                #       if not str1.nil? then
                                                #               time1=str1.gsub(/\s/,'')
                                                #
                                                #       end
                                                #}
                                                text_main1=j.to_s.encode('utf-8')
                                                #puts text_main1
                                                #
                                                #text1_con=j.text().encode('utf-8').gsub(/\n/,'__SEPSEP__')
                                                #text1_con=j.css('pre').text().gsub(/\n/,'__SEPSEP__')
                                                text1_con0=j
                                                text1_con0.css('a.sectioncategory').remove

                                                text1_con=text1_con0.text().gsub(/\n/,'__SEPSEP__')


                                        ########puts url3
                                                filename=url3.to_s.gsub(/^\//,'').gsub('/','_PVSE_')

                                                filename="data_source/#{id1}/"+filename+".dat"
                                                f_out=open(filename,"w")
                                                f_out.write(text_main1)
                                                f_out.close()
                                                write_data=url3+"\t"+time1+"\t"+title1+"\t"+text1_con
                                                puts write_data
                                                f_out0.puts write_data
                                        end
                                }
                                #exit()
                #end
                        end
                end

        }
        f_out0.close()
        #gc()
end
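Usage is the same as before; the data_list/ and data_source/ directories must exist before the function runs. A minimal sketch (the id below is just an example):

require 'fileutils'

id1="arupaka-_-arupaka"           # example id; replace with the diary to crawl
FileUtils.mkdir_p("data_list")    # the function writes data_list/<id>.dat here
FileUtils.mkdir_p("data_source")  # and data_source/<id>/<entry>.dat here
crowle_h(id1)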