TorasenLab@はてな このページをアンテナに追加 RSSフィード

2011-10-06

opencvのインストールログ

| 23:24 | opencvのインストールログを含むブックマーク opencvのインストールログのブックマークコメント

環境

  • Ubuntu 10.04 64ビット
  • python 2.6.5

インストールログ

基本的に

InstallGuide : Debian - OpenCV Wiki

に書いてあるとおり実行しました。

一部インストールするパッケージを変更しています。

IPPやTBBはインストールしていません。pythonラッパーはインストールしました。

# Build toolchain.
sudo aptitude -y install build-essential
sudo aptitude -y install cmake
sudo aptitude -y install pkg-config
# Image format libraries (PNG, zlib, JPEG 2000, TIFF, JPEG).
sudo aptitude -y install libpng12-0 libpng12-dev libpng++-dev libpng3
sudo aptitude -y install libpnglite-dev libpngwriter0-dev libpngwriter0c2
sudo aptitude -y install zlib1g-dbg zlib1g zlib1g-dev
sudo aptitude -y install libjasper-dev libjasper-runtime libjasper1
sudo aptitude -y install pngtools libtiff4-dev libtiff4 libtiffxx0c2 libtiff-tools
sudo aptitude -y install libjpeg62 libjpeg62-dev libjpeg62-dbg libjpeg-progs
# Video / capture libraries (ffmpeg, GStreamer, xine, unicap, IEEE 1394, V4L).
sudo aptitude -y install ffmpeg libavcodec-dev libavcodec52 libavformat52 libavformat-dev
sudo aptitude -y install libgstreamer0.10-0-dbg libgstreamer0.10-0 libgstreamer0.10-dev
sudo aptitude -y install libxine1-ffmpeg libxine-dev libxine1-bin
sudo aptitude -y install libunicap2 libunicap2-dev
sudo aptitude -y install libdc1394-22-dev libdc1394-22 libdc1394-utils
sudo aptitude -y install swig
sudo aptitude -y install libv4l-0 libv4l-dev
# Python-side dependencies for the Python wrapper.
sudo aptitude -y install python-numpy


sudo aptitude -y install libpython2.6 python-dev python2.6-dev


sudo aptitude -y install libjpeg-progs libjpeg-dev


sudo aptitude -y install libgstreamer-plugins-base0.10-dev


# Check out the OpenCV trunk into a scratch directory.
mkdir ocv
cd ocv/
# sudo aptitude install subversion
svn co https://code.ros.org/svn/opencv/trunk

# Out-of-source release build with Python support and examples enabled,
# installed under /usr/local.
cd trunk/opencv/
mkdir release
cd release/
cmake -D CMAKE_BUILD_TYPE=RELEASE -D CMAKE_INSTALL_PREFIX=/usr/local -D BUILD_PYTHON_SUPPORT=ON -D BUILD_EXAMPLES=ON ..
make
sudo make install

# Refresh the shared-library cache, then confirm pkg-config can see opencv.
sudo ldconfig
cd unix-install
pkg-config opencv --libs

動作チェック

C++でのチェック

ダウンロードしたファイルのtrunk/opencv/samples/cpp/ディレクトリのファイルがコンパイルできるかどうか。

コンパイルは、

g++ example.cc `pkg-config opencv --cflags --libs`

など。

pythonラッパーのチェック

pythonでimport cvが成功するかどうか。

ダウンロードしたファイルのtrunk/opencv/samples/python/delaunay.pyがpythonで実行できるかどうか。

参考

InstallGuide : Debian - OpenCV Wiki

2011-09-14

見出し語化の高速化

| 00:59 | 見出し語化の高速化を含むブックマーク 見出し語化の高速化のブックマークコメント

nltkのWordNetLemmatizerを力ずくで高速化した。

環境

Python 2.6.5

コード

# -*- coding: utf-8 -*-

from collections import defaultdict
import nltk
from nltk.corpus import wordnet as _wordnet


# Stemming callable: the bound ``stem`` method of one shared PorterStemmer.
_STEMMER = nltk.PorterStemmer().stem
# Order in which a part of speech is chosen from a candidate set when
# automatic detection yields nothing usable (see _lemmatize_form_with_wordnet).
_LEMMATIZATION_POS_PRIORITY = (_wordnet.NOUN, _wordnet.VERB,
                               _wordnet.ADJ, _wordnet.ADV)
# Parts of speech the lemmatization helpers accept (asserted before each
# WordNetLemmatizer call).
_POS_LIST = (_wordnet.ADJ, _wordnet.ADV, _wordnet.NOUN, _wordnet.VERB)


def stem_form(form):
    """Return the stem of ``form`` as computed by the module-level stemmer."""
    stemmed = _STEMMER(form)
    return stemmed


def _detect_pos(form):
    """Guess the WordNet part of speech for ``form``.

    Spaces are replaced by underscores before the synset lookup. The part of
    speech of the first synset whose name (minus its last five characters)
    stems to the same value as ``form`` wins; otherwise the first synset's
    part of speech is used. ``ADJ_SAT`` is reported as ``ADJ``.

    Returns None when WordNet has no synsets for the form.
    """
    wordnet_form = form.replace(' ', '_')
    candidates = _wordnet.synsets(wordnet_form)
    if not candidates:
        return None

    target_stem = stem_form(wordnet_form)
    # name[:-5] presumably strips the ".n.01"-style sense suffix -- verify
    # against the installed NLTK version.
    detected = next((candidate.pos for candidate in candidates
                     if stem_form(candidate.name[:-5]) == target_stem),
                    None)
    if detected is None:
        detected = candidates[0].pos
    return _wordnet.ADJ if detected == _wordnet.ADJ_SAT else detected


def lemmatize_with_wordnet(form, pos=None):
    """Lemmatize ``form`` with nltk's WordNetLemmatizer.

    When ``pos`` is None the part of speech is detected with ``_detect_pos``;
    if detection finds nothing, ``form`` is returned unchanged. The part of
    speech finally used must be one of ``_POS_LIST``.
    """
    chosen_pos = _detect_pos(form) if pos is None else pos
    if pos is None and not chosen_pos:
        return form
    assert chosen_pos in _POS_LIST
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(form, pos=chosen_pos)


def _lemmatize_form_with_wordnet(form, pos_set):
    """Lemmatize ``form`` using a part of speech drawn from ``pos_set``.

    A single-element set decides the part of speech directly. Otherwise the
    detected part of speech is used when it belongs to ``pos_set``; failing
    that, the first entry of ``_LEMMATIZATION_POS_PRIORITY`` found in
    ``pos_set`` is taken.
    """
    assert pos_set
    if len(pos_set) == 1:
        target_pos = next(iter(pos_set))
    else:
        target_pos = _detect_pos(form)
        detection_usable = target_pos and target_pos in pos_set
        if not detection_usable:
            # Keep the detected value if no prioritized POS is available.
            target_pos = next((candidate
                               for candidate in _LEMMATIZATION_POS_PRIORITY
                               if candidate in pos_set),
                              target_pos)
    assert target_pos in _POS_LIST
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(form, pos=target_pos)


def _construct_inflected_form_to_lemma_dictionary():
    """Build a dictionary mapping every known inflected form to its lemma.

    Forms are gathered from WordNet's exception map (``ADJ_SAT`` skipped),
    from every lemma name (underscores turned into spaces) and from applying
    the morphological substitution rules in reverse to each lemma name. Each
    collected form is then lemmatized once and stored with underscores in
    the lemma replaced by spaces.
    """
    pos_by_form = defaultdict(set)

    # Forms listed in the exception map, recorded per part of speech.
    for pos, irregular_forms in _wordnet._exception_map.iteritems():
        if pos != _wordnet.ADJ_SAT:
            for irregular in irregular_forms:
                pos_by_form[irregular].add(pos)

    for pos in _POS_LIST:
        rules = _wordnet.MORPHOLOGICAL_SUBSTITUTIONS[pos]
        for raw_lemma in _wordnet.all_lemma_names(pos=pos):
            lemma = raw_lemma.replace('_', ' ')
            pos_by_form[lemma].add(pos)

            # Nouns ending in "ful" are inflected before that ending, which
            # is re-attached afterwards.
            detach_ful = pos == _wordnet.NOUN and lemma.endswith('ful')
            base = lemma[:-3] if detach_ful else lemma
            tail = 'ful' if detach_ful else ''

            for inflected_ending, lemma_ending in rules:
                if not base.endswith(lemma_ending):
                    continue
                # An empty lemma ending means "append only"; slicing with
                # [:-0] would wrongly yield an empty string.
                stem = base[:-len(lemma_ending)] if lemma_ending else base
                pos_by_form[stem + inflected_ending + tail].add(pos)

    return dict(
        (inflected,
         _lemmatize_form_with_wordnet(inflected, pos_set).replace('_', ' '))
        for inflected, pos_set in pos_by_form.iteritems())


# Inflected-form -> lemma lookup table, computed once when the module is
# imported and read by lemmatize_with_dict.
_INFLECTED_FORM_TO_LEMMA = _construct_inflected_form_to_lemma_dictionary()


def lemmatize_with_dict(form):
    """Return the lemma of ``form`` using the precomputed lookup table.

    The form is lower-cased before the lookup. Keys derived from lemma names
    are stored with spaces, while keys taken from the exception map are
    stored unchanged, so both the underscore and the space spelling of a
    multi-word form are tried (underscore first, as before).

    Returns the lower-cased form itself when the table has no entry.
    """
    lowered = form.lower()
    for key in (lowered.replace(' ', '_'), lowered.replace('_', ' ')):
        if key in _INFLECTED_FORM_TO_LEMMA:
            return _INFLECTED_FORM_TO_LEMMA[key]
    return lowered


def _test():
    """Print a tab-separated comparison of the three normalizers.

    One header row, then one row per sample word: the word itself, its stem,
    its WordNetLemmatizer lemma and its dictionary lemma.
    """
    samples = ('media', 'playing', 'player', 'possesses', 'sung', 'became',
               'begun', 'fallen', 'men', 'buses', 'initial', 'initialization')
    row_template = '{0}\t{1}\t{2}\t{3}'
    print('original\tstemming\twordnet_lemmatizer\tlemmatize_with_dict')
    for word in samples:
        row = row_template.format(word,
                                  stem_form(word),
                                  lemmatize_with_wordnet(word),
                                  lemmatize_with_dict(word))
        print(row)


# Print the comparison table only when executed as a script.
if __name__ == '__main__':
    _test()

'''
original        stemming   wordnet_lemmatizer   lemmatize_with_dict
media           media      medium               medium
playing         play       playing              playing
player          player     player               player
possesses       possess    posse                possess
sung            sung       sung                 sung
became          becam      become               become
begun           begun      begin                begin
fallen          fallen     fall                 fall
men             men        man                  man
buses           buse       bus                  bus
initial         initi      initial              initial
initialization  initi      initialization       initialization
'''

nltkのWordNetLemmatizerの動作は、入力語句に対して以下の手順で見出し語化を行っているようです。

  1. 入力語句が例外リストに載っていないかどうかチェックする。載っていれば別途処理する。
  2. 末尾置換ルールを用いて入力語句の末尾を置換する。
  3. 辞書の見出し語に語句が存在しているかどうかをチェックする。
  4. 末尾置換ルールが無くなるまで手順2.と3.を繰り返す。

手順1.の例外リストとは、主に不規則変化(sing-sang-sungなど)を解決するための手順のようです。

手順2.と手順3.では、あらかじめ用意された末尾置換ルールを使用します。

Î㤨¤ÐÆþÎϸì¶ç¤¬Ì¾»ì¤Ç¤¢¤ê¡¢xes¤Ç½ª¤ï¤Ã¤Æ¤¤¤ë¾ì¹ç¡¢ËöÈø¤«¤éxes¤ò¼è¤ê½ü¤­¡¢Âå¤ï¤ê¤Ëx¤òÄɲ乤롣¤½¤Î¸å¡¢ÃÖ´¹¤·¤¿Ã±¸ì¤¬¼­½ñ¤Î¸«½Ð¤·¸ì¤Ë¤¢¤ë¤«¤É¤¦¤«¤ò¥Á¥§¥Ã¥¯¤¹¤ë¡£Î㤨¤ÐÆþÎϸì¶ç¤¬boxes¤Ç¤¢¤ë¤È¤­¡¢ËöÈø¤Îxes¤òx¤ËÃÖ´¹¤·box¤È¤·¡¢¤½¤Î¸åbox¤¬¼­½ñ¤Î¸«½Ð¤·¸ì¤Ë¤¢¤ë¤«¤É¤¦¤«¥Á¥§¥Ã¥¯¤¹¤ë¡£

¥½¡¼¥¹¥³¡¼¥É¤òÆÉ¤à¸Â¤ê¡¢nltk¤ÎWordNetLemmatizer¤ÏËܲÈWordNet¤ÎC¸À¸ì¼ÂÁõ¤ÈƱÍͤμê½ç¤Ç¸«½Ð¤·¸ì²½¤ò¹Ô¤Ã¤Æ¤¤¤ë¤é¤·¤¤¡£

ËöÈø¤ÎÃÖ´¹¡¢¸¡º÷¡¢¤ò·«¤êÊÖ¤·¤Æ¤¤¤ë¤Î¤Ç¼Â¹Ô®ÅÙ¤¬ÃÙ¤¤¡£

¤½¤³¤Ç¡¢¤³¤ì¤éÎã³°¥ê¥¹¥È¤ÈÃÖ´¹¥ë¡¼¥ë¤ò»ÈÍѤ·¤Æ¡¢WordNetLemmatizer¤¬½èÍý¤Ç¤­¤ë¸ì¶ç¤ò¤¹¤Ù¤ÆÀ¸À®¤¹¤ë¡£

¤½¤ì¤éWordNetLemmatizer¤¬½èÍý¤Ç¤­¤ë¸ì¶ç¤ò¥­¡¼¤È¤·¡¢¤½¤ì¤é¤Î¸«½Ð¤·¸ì¤òÃͤȤ¹¤ë¼­½ñ(_INFLECTED_FORM_TO_LEMMA)¤òºîÀ®¤·¤¿¡£

{'boxes': 'box', 'box': 'box', ..., 'media': 'medium', 'medium': 'medium', ...}¤Î¤è¤¦¤Ê³èÍÑ·Á¤¬¥­¡¼¤Ç¤¢¤ê¡¢¸«½Ð¤·¸ì¤¬ÃͤǤ¢¤ë¼­½ñ¡£

¼Â¹Ô®Å٥ƥ¹¥È

American National Corpus¤«¤éÃê½Ð¤·¤¿16,814,123¸Ä¤Î±Ññ¸ì(296,528¼ïÎà)¤ËÂФ·¤Æ¡¢¸«½Ð¤·¸ì²½¤Ë¤«¤«¤ë»þ´Ö¤ò¬¤Ã¤¿¡£

nltk¤ÎWordNetLemmatizer¤Ë¤è¤ë¸«½Ð¤·¸ì²½(lemmatize_with_wordnet)¤Ç¤Ï3583ÉÃ(¤ª¤è¤½1»þ´Ö)¡¢

nltk¤ÎWordNetLemmatizer¤Ë¤è¤ë¸«½Ð¤·¸ì²½(lemmatize_with_wordnet)¤ÎÆþ½ÐÎϤò¥­¥ã¥Ã¥·¥å¤·¤¿¾ì¹ç¤Ç¤Ï80Éá¢

º£²óºîÀ®¤·¤¿ºîÀ®¤·¤¿¼­½ñ¤ò»ÈÍѤ·¤¿¸«½Ð¤·¸ì²½(lemmatize_with_dict)¤Ç¤Ï20Éá¢

¤È¤Ê¤Ã¤¿¡£·ë¶ÉÆþ½ÐÎϤò¥­¥ã¥Ã¥·¥å¤·¤¿¾ì¹ç¤ÈÂ纹̵¤¤·ë²Ì¤Ë¤Ê¤Ã¤¿¡£

¤·¤«¤·¡¢Æþ½ÐÎϤò¥­¥ã¥Ã¥·¥å¤¹¤ë¾ì¹ç¡¢WordNet¤ËÅÐÏ¿¤µ¤ì¤Æ¤¤¤Ê¤¤Ã±¸ì¤ä¸Çͭ̾»ì¤Ê¤É¤¬ÆþÎϤµ¤ì¤ë¤¿¤Ó¡¢¥­¥ã¥Ã¥·¥å¤Î¥µ¥¤¥º¤¬Â礭¤¯¤Ê¤ë¡£

¤½¤ÎÅÀ¡¢º£²óºîÀ®¤·¤¿¼­½ñ¤Î¥µ¥¤¥º¤Ï¸ÇÄꤵ¤ì¤Æ¤¤¤ë¤Î¤Ç¡¢¥á¥â¥ê¾ÃÈñÎ̤ÎÁý²Ã¤Ê¤É¤òµ¤¤Ë¤·¤Ê¤¯¤ÆÎɤ¤¡£

º£²óºîÀ®¤·¤¿¼­½ñ¤Ë¤âÌäÂêÅÀ¤¬¤¢¤ë¡£º£²óºîÀ®¤·¤¿¼­½ñ¤Ç¤Ï¡¢ÆþÎϸì¶ç¤«¤é½ÐÎϤ¬°ì°Õ¤Ë·è¤Þ¤ë¤¬¡¢Ä̾ï¤Ï°ì°Õ¤Ë¤Ï·è¤Þ¤é¤Ê¤¤¡£Î㤨¤Ð¡¢better¤Î¸«½Ð¤·¸ì¤Ïwell¤«good¤«¤ÏÉÊ»ì¤òÍѤ¤¤Ê¤¤¸Â¤ê·è¤á¤Ë¤¯¤¤¡£

2011-08-06

pythonによる文字列の正規化

| 00:31 | pythonによる文字列の正規化を含むブックマーク pythonによる文字列の正規化のブックマークコメント

テキストマイニングなどを行うためには文書、文、単語などの文字列の正規化が重要です。

単語の大文字小文字の統一、半角全角の統一などをする必要があります。

文字列の正規化のために利用しているpythonコードを以下に書いておきます。

今後増える可能性もあります。

実行環境

Ubuntu 10.04 64ビット

python 2.6.5

unicode型に変換する

def unicode_ignore_invalid_char(text):
    """Decode a ``str`` as UTF-8, dropping bytes that cannot be decoded.

    Any value that is not a ``str`` is returned unchanged.
    """
    return text.decode('utf-8', 'ignore') if isinstance(text, str) else text

変換不能な文字列を無視してstr型からunicode型に変換する。

str型に変換する

def str_ignore_invalid_char(text):
    """Encode a ``unicode`` value as UTF-8, ignoring unencodable characters.

    Any value that is not ``unicode`` is returned unchanged.
    """
    return text.encode('utf-8', 'ignore') if isinstance(text, unicode) else text

変換不能な文字列を無視してunicode型からstr型に変換する。

入出力の文字列型を統一する

from functools import wraps

def consistent_texttype(function):
    """Decorator that makes the output text type follow the input text type.

    The first positional argument decides the type: ``unicode`` input gives
    ``unicode`` output, ``str`` input gives ``str`` output. For any other
    input (treated as an indexable collection) the type of its first element
    decides; an empty collection is treated like ``str`` input instead of
    raising IndexError.

    A string result is converted directly; any other result is converted
    element by element with ``map``.
    """
    @wraps(function)
    def _consistent_texttype(*args, **kwargs):
        assert(1 <= len(args))
        input_text = args[0]
        if isinstance(input_text, unicode):
            is_unicode = True
        elif isinstance(input_text, str):
            is_unicode = False
        else:
            # for collections: the first element decides the text type
            try:
                is_unicode = isinstance(input_text[0], unicode)
            except IndexError:
                # empty collection: nothing to inspect, fall back to str
                is_unicode = False
        if is_unicode:
            convert = unicode_ignore_invalid_char
        else:
            convert = str_ignore_invalid_char
        output_text = function(*args, **kwargs)
        if isinstance(output_text, (unicode, str)):
            return convert(output_text)
        return map(convert, output_text)
    return _consistent_texttype

入力文字列がstr型であるとき出力文字列もstr型にし、入力文字列がunicode型であるとき出力文字列もunicode型にするデコレータ。

unicodeを正規化する

import unicodedata

@consistent_texttype
def normalize_unicode(text, form='NFKC'):
    """Apply Unicode normalization ``form`` to ``text``.

    ``form`` must be one of 'NFC', 'NFKC', 'NFD' or 'NFKD'. The text is
    converted to ``unicode`` first; the decorator restores the caller's
    text type on the way out.
    """
    assert form in ('NFC', 'NFKC', 'NFD', 'NFKD')
    return unicodedata.normalize(form, unicode_ignore_invalid_char(text))

Ⱦ³Ñ¥«¥¿¥«¥Ê¤òÁ´³Ñ¥«¥¿¥«¥Ê¤ËÊÑ´¹¤·¤¿¤ê¤¹¤ë¡£

Î㤨¤Ð­Ï¤ò¥Ú¡¼¥¸¡¢ŽÊŽÝ޶ޏ޶ŽÅ¤ò¥Ï¥ó¥«¥¯¥«¥Ê¤ËÊÑ´¹¤¹¤ë¡£

HTML¥¨¥ó¥Æ¥£¥Æ¥£¤òÊÑ´¹¤¹¤ë

from BeautifulSoup import BeautifulSoup

@consistent_texttype
def unescape_entities_with_beautifulsoup(htmltext, prettify=False):
    """Convert HTML entities in ``htmltext`` by parsing it with BeautifulSoup.

    Returns the prettified document when ``prettify`` is true, otherwise the
    parsed document's ``__repr__`` output.
    """
    parsed = BeautifulSoup(htmltext,
                           convertEntities=BeautifulSoup.HTML_ENTITIES)
    return parsed.prettify() if prettify else parsed.__repr__()

BeautifulSoup¤òÍøÍѤ·¤ÆHTML¥¨¥ó¥Æ¥£¥Æ¥£¤òÊÑ´¹¤¹¤ë¡£

Î㤨¤Ð&gt;¤ò>¤ËÊÑ´¹¤¹¤ë¡£

from BeautifulSoup import BeautifulStoneSoup

@consistent_texttype
def unescape_entities_with_beautifulstonesoup(htmltext, prettify=False):
    """Convert HTML entities in ``htmltext`` by parsing it with BeautifulStoneSoup.

    Returns the prettified document when ``prettify`` is true, otherwise the
    parsed document's ``__repr__`` output.
    """
    entity_mode = BeautifulStoneSoup.HTML_ENTITIES
    parsed = BeautifulStoneSoup(htmltext, convertEntities=entity_mode)
    return parsed.prettify() if prettify else parsed.__repr__()

BeautifulStoneSoup¤òÍøÍѤ·¤ÆHTML¥¨¥ó¥Æ¥£¥Æ¥£¤òÊÑ´¹¤¹¤ë¡£

BeautifulSoup¤òÍøÍѤ·¤¿¾ì¹ç¤ÈƱ¤¸¤«¤â¤·¤ì¤Ê¤¤¡£

from htmlentitydefs import name2codepoint
import re

# derived from BeautifulSoup
# __author__ = "Leonard Richardson (leonardr@segfault.org)"
# __version__ = "3.1.0.1"
# __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
# __license__ = "New-style BSD"
def _unescape_entity(match):
    """Return the replacement text for one matched entity reference.

    ``match.group(1)`` is the text between ``&`` and ``;``. Named entities
    found in ``name2codepoint`` and numeric references (decimal ``#123`` or
    hexadecimal ``#x7b``) become the corresponding character. Anything else,
    including a numeric reference that is malformed or outside the range
    ``unichr`` accepts, is returned unchanged as ``&...;`` so that one bad
    reference cannot abort the whole substitution.
    """
    x = match.group(1)
    if x in name2codepoint:
        return unichr(name2codepoint[x])
    if x.startswith('#'):
        if x[1:2] == 'x':
            digits, base = x[2:], 16
        else:
            digits, base = x[1:], 10
        try:
            return unichr(int(digits, base))
        except (ValueError, OverflowError):
            # not a usable code point: fall through and keep the raw text
            pass
    return u'&{0};'.format(x)


@consistent_texttype
def unescape_entities(htmltext):
    """Replace HTML entity references in ``htmltext`` with their characters.

    The text is converted to ``unicode`` first and every ``&name;``,
    ``&#123;`` or ``&#x7b;`` reference is passed to ``_unescape_entity``.
    """
    entity_pattern = u'&(#\d+|#x[0-9a-fA-F]+|\w+);'
    unescaped_text = re.sub(entity_pattern, _unescape_entity,
                            unicode_ignore_invalid_char(htmltext))
    assert isinstance(unescaped_text, unicode)
    return unescaped_text

BeautifulSoup¤ÎHTML¥¨¥ó¥Æ¥£¥Æ¥£ÊÑ´¹Éôʬ¤òÃê½Ð¤·¡¢¾¯¤·Êѹ¹¤ò²Ã¤¨¤¿¤â¤Î¡£

HTML¥¨¥ó¥Æ¥£¥Æ¥£ÊÑ´¹¤Î¤¿¤á¤À¤±¤ËBeautifulSoup¤òÍøÍѤ¹¤ë¤Î¤Ï¹â²Á¤¹¤®¤ë¤È¹Í¤¨¤ë¤È¤­¤Ï¤³¤Á¤é¤òÍøÍѤ¹¤ë¡£

BeautifulSoup¤ÏNew-style BSD¥é¥¤¥»¥ó¥¹¤Ç¤¹¡£

¸ì´´¤òÃê½Ð¤¹¤ë(¥¹¥Æ¥ß¥ó¥°)

import nltk

@consistent_texttype
def stem_term(term, porter=True):
    """Return the stem of ``term``.

    Uses nltk's PorterStemmer when ``porter`` is true, LancasterStemmer
    otherwise.
    """
    stemmer_class = nltk.PorterStemmer if porter else nltk.LancasterStemmer
    return stemmer_class().stem(term)

±Ñ¸ìÍÑ¡£Î㤨¤Ðinitial, initialize¤òiniti¤Ë¤¹¤ë¡£

¸«½Ð¤·¸ì²½¡¦¥ì¥ó¥Þ²½(lemmatization)

import nltk
from nltk.corpus import wordnet

@consistent_texttype
def lemmatize_term(term, pos=None):
    """Lemmatize ``term`` with nltk's WordNetLemmatizer.

    When ``pos`` is None, the part of speech of the term's first synset is
    used (``ADJ_SAT`` counted as ``ADJ``); a term without synsets is
    returned unchanged. The part of speech must be NOUN, VERB, ADJ or ADV.
    """
    if pos is None:
        senses = wordnet.synsets(term)
        if not senses:
            return term
        first_pos = senses[0].pos
        pos = wordnet.ADJ if first_pos == wordnet.ADJ_SAT else first_pos
    allowed = (wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV)
    assert pos in allowed
    lemmatizer = nltk.WordNetLemmatizer()
    return lemmatizer.lemmatize(term, pos=pos)

±Ñ¸ìÍÑ¡£WordNet¤òÍѤ¤¤ÆÃ±¸ì¤Î¸«½Ð¤·¸ì²½¤ò¹Ô¤¦¡£

Î㤨¤Ðis, are¤òbe¤Ë¡¢potatos¤òpotato¤Ë¤¹¤ë¡£

ÉÊ»ì(pos)¤Î»ØÄ꤬¤Ê¤±¤ì¤Ðsynsets¤Î¤¦¤Á¡¢°ìÈֺǽé¤Ë¸½¤ì¤ëÉÊ»ì¤ò»ÈÍѤ¹¤ë¡£

¾®Ê¸»ú¤Ë¤¹¤ë

text.lower()

¤â¤·¤¯¤Ï¡¢

import string

def lower_text(text):
    """Return ``text`` converted to lowercase.

    Calls the string's own ``lower`` method rather than the deprecated
    ``string.lower`` helper, which does not exist in Python 3.
    """
    return text.lower()

Âçʸ»ú¤Ë¤¹¤ë

text.upper()

¤â¤·¤¯¤Ï¡¢

import string

def upper_text(text):
    """Return ``text`` converted to uppercase.

    Calls the string's own ``upper`` method rather than the deprecated
    ``string.upper`` helper, which does not exist in Python 3.
    """
    return text.upper()

ÀèÆ¬¤Î¤ßÂçʸ»ú¤Ë¤¹¤ë

text.capitalize()

¤â¤·¤¯¤Ï¡¢

import string

def capitalize_text(text):
    """Return ``text`` with its first character upper-cased and the rest lower-cased.

    Calls the string's own ``capitalize`` method rather than the deprecated
    ``string.capitalize`` helper, which does not exist in Python 3.
    """
    return text.capitalize()

»²¹Í¥µ¥¤¥È

UnicodeDecodeError¤¬È¯À¸¤¹¤ëʸ»ú¤òignore¥ª¥×¥·¥ç¥ó¤Ç̵»ë¤¹¤ë - Pyro Memo

»²¹Íʸ¸¥

ÆþÌç ¼«Á³¸À¸ì½èÍý

ÆþÌç ¼«Á³¸À¸ì½èÍý

2011-07-22

nkf python ¥¤¥ó¥¿¡¼¥Õ¥§¡¼¥¹¤Î¥¤¥ó¥¹¥È¡¼¥ë

| 22:49 | nkf python ¥¤¥ó¥¿¡¼¥Õ¥§¡¼¥¹¤Î¥¤¥ó¥¹¥È¡¼¥ë¤ò´Þ¤à¥Ö¥Ã¥¯¥Þ¡¼¥¯ nkf python ¥¤¥ó¥¿¡¼¥Õ¥§¡¼¥¹¤Î¥¤¥ó¥¹¥È¡¼¥ë¤Î¥Ö¥Ã¥¯¥Þ¡¼¥¯¥³¥á¥ó¥È

´Ä¶­

  • Ubuntu 10.04 32¥Ó¥Ã¥È
  • python 2.6.5

¥¤¥ó¥¹¥È¡¼¥ëÊýË¡

$ mkdir temp  # 作業用ディレクトリの作成
$ cd temp
# http://sourceforge.jp/projects/nkf/ から nkf-2.1.1.tar.gz をダウンロード
temp$ tar zxvf nkf-2.1.1.tar.gz
temp$ cd nkf-2.1.1/

temp/nkf-2.1.1$ wget ftp://city.plala.jp:1221/NkfPython/NKF_python20090602.tgz
temp/nkf-2.1.1$ tar zxvf NKF_python20090602.tgz
temp/nkf-2.1.1$ cd NKF.python/
temp/nkf-2.1.1/NKF.python$ sudo python setup.py install
# Python.h: No such file or directory と表示された場合は、以下を実行する
# sudo aptitude install python-dev
temp/nkf-2.1.1/NKF.python$ cd ../../../
$ sudo rm -rf temp

»ÈÍÑÊýË¡

# Usage sketch: ``input_text`` must already hold the text to convert.
import nkf
# nkf option string; '-w' presumably selects UTF-8 output -- confirm in the
# nkf manual.
flag = '-w'
output = nkf.nkf(flag, input_text)

# Ask nkf to guess the encoding of the input.
input_code = nkf.guess(input_text)

»²¹Í¥ê¥ó¥¯

no title

nkf for python¤ÎÆþ¤ìÊý¤È»È¤¤Êý - ¤Õ¤Ë¤ã¤ë¤ó

2011-05-28

Pythonにおける並行処理について

| 23:47 | Pythonにおける並行処理についてを含むブックマーク Pythonにおける並行処理についてのブックマークコメント

µ¤¤Ë¤Ê¤Ã¤¿¤Î¤ÇPython¤ÎGIL(Global Interpreter Lock)¤¬Ê¹ԽèÍý¤Ë¤É¤ÎÄøÅٱƶÁ¤¹¤ë¤«¤Ë¤Ä¤¤¤Æ¾¯¤·¼Â¸³¤·¤Þ¤·¤¿¡£

¤Ï¤¸¤á¤Ë

¤Þ¤º¡¢¡Öʹԡ׽èÍý¤È¡ÖÊÂÎó¡×½èÍý¤È¤¤¤¦¸ÀÍÕ¤ò¶èÊ̤¹¤ëɬÍפ¬¤¢¤ê¤Þ¤¹¡£²¼µ­»²¹Íʸ¸¥¤Î¡Öʹԥ³¥ó¥Ô¥å¡¼¥¿µ»Ë¡¡×¤Ë¤è¤ë¤È¡¢

 ¥·¥¹¥Æ¥à¤¬Ê£¿ô¤Îưºî¤òƱ»þ¤Ë¼Â¹Ô¾õÂÖ(in progress)¤ËÊݤƤ뵡ǽ¤òÈ÷¤¨¤Æ¤¤¤ë¾ì¹ç¤òʹÔ(concurrent)¤È¸À¤¤¡¢
Ê£¿ô¤Îưºî¤òƱ»þ¤Ë¼Â¹Ô¤Ç¤­¤ë¾ì¹ç¤òÊÂÎó(parallel)¤È¸À¤¤¤Þ¤¹¡£
 ½ÅÍפʳµÇ°¡¢°ã¤¤¤Ï¡Ö¼Â¹Ô¾õÂ֡פȤ¤¤¦ÅÀ¤Ç¤¹¡£
...ÃæÎ¬...
 ¡ÖʹԡפϡÖÊÂÎó¡×¤ò´ÞÍ­¤·¤Þ¤¹¡£

¤À¤½¤¦¤Ç¤¹¡£Æ±½ñ¤Ë¤è¤ë¤È¡¢1¤Ä¤ÎCPU¥³¥¢¤¬2¤Ä¤Î¥¹¥ì¥Ã¥É¤òÀÚ¤êÂØ¤¨¤Ê¤¬¤é½èÍý¤¹¤ë¾ì¹ç¤Ï¡Öʹԡ׽èÍý¤Ë´Þ¤Þ¤ì¤ë¤è¤¦¤Ç¤¹¡£¡ÖÊÂÎó¡×½èÍý¤Ç¤ÏÊ£¿ô¤ÎCPU¥³¥¢¤¬É¬¿Ü¤Ç¡¢Ê£¿ô¤Î¥¹¥ì¥Ã¥É¤¬Ê£¿ô¤ÎCPU¥³¥¢¤Ë¤è¤êƱ»þ¤Ë¼Â¹Ô¤µ¤ì¤ë»ö¤ò¡ÖÊÂÎó¡×½èÍý¤È¸À¤¦¤è¤¦¤Ç¤¹¡£

Ìܻؤ¹¤Ù¤­¤Ï¡Öʹԡ׽èÍý¤Ç¤Ï¤Ê¤¯¡ÖÊÂÎó¡×½èÍý¤Êµ¤¤¬¤·¤Þ¤¹¡£

テスト環境

  • Intel Core i7-920 Processor (2.66 GHz × 8)
  • Memory 9GB
  • Ubuntu Lucid 10.04 64bit
  • Python 2.6.5

テストコード

1. 逐次処理でのCPU負荷の大きい処理

# sequential_cpu.py


def _cpu_bound_work():
    """Spin in a pure-Python loop, incrementing a counter 100,000,000 times."""
    i = 0
    while i < 100000000:
        i += 1


if __name__ == '__main__':
    # Run the CPU-bound workload eight times, one after another.
    for _ in xrange(8):
        _cpu_bound_work()

2. threading¤Ç¤ÎCPUÉé²Ù¤ÎÂ礭¤¤½èÍý

# threading_cpu.py

import threading


def _cpu_bound_work():
    """Spin in a pure-Python loop, incrementing a counter 100,000,000 times."""
    i = 0
    while i < 100000000:
        i += 1


class TestThread(threading.Thread):
    """Thread whose body is a single call to ``_cpu_bound_work``."""
    def run(self):
        _cpu_bound_work()


if __name__ == '__main__':
    mainthread = threading.currentThread()
    # Start eight worker threads running the CPU-bound workload.
    for _ in xrange(8):
        thread = TestThread()
        thread.start()
    # Wait for every live thread except the main one (joining the main
    # thread from itself is not possible).
    for thread in threading.enumerate():
        if mainthread != thread:
            thread.join()

3. multiprocessing¤Ë¤è¤ëCPUÉé²Ù¤ÎÂ礭¤¤½èÍý

# multiprocessing_cpu.py

import multiprocessing


def _cpu_bound_work():
    """Spin in a pure-Python loop, incrementing a counter 100,000,000 times."""
    i = 0
    while i < 100000000:
        i += 1


class TestProcess(multiprocessing.Process):
    """Process whose body is a single call to ``_cpu_bound_work``."""
    def run(self):
        _cpu_bound_work()


if __name__ == '__main__':
    # Start eight worker processes running the CPU-bound workload.
    for _ in xrange(8):
        process = TestProcess()
        process.start()
    # Wait for every child process that is still alive.
    for process in multiprocessing.active_children():
        process.join()

4. Ã༡½èÍý¤Ë¤è¤ëIOÂÔ¤Á¤ÎÂ礭¤¤½èÍý

# sequential_io.py

import time


def _io_bound_work():
    """Block for ten seconds as a stand-in for an I/O wait."""
    time.sleep(10.0)  # to simulate i/o bound work


if __name__ == '__main__':
    # Run the simulated I/O wait eight times, one after another.
    for _ in xrange(8):
        _io_bound_work()

5. threading¤Ë¤è¤ëIOÂÔ¤Á¤ÎÂ礭¤¤½èÍý

# threading_io.py

import threading
import time


def _io_bound_work():
    """Block for ten seconds as a stand-in for an I/O wait."""
    time.sleep(10.0)  # to simulate i/o bound work


class TestThread(threading.Thread):
    """Thread whose body is a single call to ``_io_bound_work``."""
    def run(self):
        _io_bound_work()


if __name__ == '__main__':
    mainthread = threading.currentThread()
    # Start eight worker threads, each performing the simulated I/O wait.
    for _ in xrange(8):
        thread = TestThread()
        thread.start()
    # Wait for every live thread except the main one.
    for thread in threading.enumerate():
        if mainthread != thread:
            thread.join()

6. multiprocessing¤Ë¤è¤ëIOÂÔ¤Á¤ÎÂ礭¤¤½èÍý

# multiprocessing_io.py

import multiprocessing
import time


def _io_bound_work():
    """Block for ten seconds as a stand-in for an I/O wait."""
    time.sleep(10.0)  # to simulate i/o bound work


class TestProcess(multiprocessing.Process):
    """Process whose body is a single call to ``_io_bound_work``."""
    def run(self):
        _io_bound_work()


if __name__ == '__main__':
    # Start eight worker processes, each performing the simulated I/O wait.
    for _ in xrange(8):
        process = TestProcess()
        process.start()
    # Wait for every child process that is still alive.
    for process in multiprocessing.active_children():
        process.join()

¥Æ¥¹¥È·ë²Ì

1. Ã༡½èÍý¤Ë¤è¤ëCPUÉé²Ù¤ÎÂ礭¤¤½èÍý

$ time python sequential_cpu.py
real	0m45.265s
user	0m45.230s
sys	0m0.020s

2. threading¤Ë¤è¤ëCPUÉé²Ù¤ÎÂ礭¤¤½èÍý

$ time python threading_cpu.py 
real	1m8.033s
user	1m7.420s
sys	0m16.930s

3. multiprocessing¤Ë¤è¤ëCPUÉé²Ù¤ÎÂ礭¤¤½èÍý

$ time python multiprocessing_cpu.py 
real	0m10.969s
user	1m24.960s
sys	0m0.040s

4. Ã༡½èÍý¤Ë¤è¤ëIOÂÔ¤Á¤ÎÂ礭¤¤½èÍý

$ time python sequential_io.py 
real	1m20.095s
user	0m0.010s
sys	0m0.010s

5. threading¤Ë¤è¤ëIOÂÔ¤Á¤ÎÂ礭¤¤½èÍý

$ time python threading_io.py 
real	0m10.029s
user	0m0.020s
sys	0m0.000s

6. multiprocessing¤Ë¤è¤ëIOÂÔ¤Á¤ÎÂ礭¤¤½èÍý

$ time python multiprocessing_io.py 
real	0m10.035s
user	0m0.020s
sys	0m0.010s

¤Þ¤È¤á

1,4¤ÎÃ༡½èÍý¤¬ÃÙ¤¤¤Î¤ÏÅöÁ³¤È¤·¤Æ¡¢2¤Îthreading¥â¥¸¥å¡¼¥ë¤ò»ÈÍѤ·¤ÆCPUÉé²Ù¤ÎÂ礭¤¤½èÍý¤ò¹Ô¤Ã¤¿¾ì¹ç¤Î¼Â¹Ô®ÅÙ¤¬¤«¤Ê¤êÃÙ¤¤¤Ç¤¹¡£

python¤ÎGIL¤Î±Æ¶Á¤Ç¡¢ÊÂÎó½èÍý¤¬¤Ç¤­¤Æ¤¤¤Ê¤¤»ö¤¬¸¶°ø¤Ê¤Î¤Ç¤·¤ç¤¦¡£3¤Îmultiprocessing¥â¥¸¥å¡¼¥ë¤ò»ÈÍѤ·¤¿¾ì¹ç¤ÏGIL¤ò²óÈò¤Ç¤­¤ë¤è¤¦¤Ç¤¹¡£

ÊÂÎó½èÍý¤Ç¤­¤Ê¤¤threading¥â¥¸¥å¡¼¥ë¤ò»È¤¦°ÕÌ£¤Ï¤¢¤ë¤Î¤Ç¤·¤ç¤¦¤«¡£¾¯¤Ê¤¯¤È¤â5¤Î¤è¤¦¤Ë¡¢IOÂÔ¤Á»þ´Ö¤¬Ä¹¤¤½èÍý¤òÊ£¿ô²ó¹Ô¤¦¾ì¹ç¤Ïthreading¥â¥¸¥å¡¼¥ë¤ò»ÈÍѤ¹¤ë°ÕÌ£¤Ï¤¢¤ë¤è¤¦¤Ç¤¹¡£web¥Ú¡¼¥¸¤Î¥¯¥í¡¼¥é¤Ê¤É¤Ë¤Ï¸þ¤¤¤Æ¤¤¤ë¤è¤¦¤Ç¤¹¡£

¤ä¤Ï¤êthreading¥â¥¸¥å¡¼¥ë¤è¤ê¤âmultiprocessing¥â¥¸¥å¡¼¥ë¤ò»ÈÍѤ·¤¿Êý¤¬¤¤¤¤µ¤¤¬¤¹¤ë¡£(¤â¤·¤¯¤Ïos.fork¤ò»ÈÍѤ¹¤ë¤«)

»²¹Í

17.2. multiprocessing ? Process-based parallelism — Python v3.4.0a0 documentation

2.6¤Ë¿·ÅëºÜ¤Îmultiprocessing¤ò¸«¤Æ²¶¤ÎPython¤¬¤ª¤Ã¤­¤·¤¿·ï | TRIVIAL TECHNOLOGIES @ats ¤Î¥¤¥¯¥á¥óÆüµ­

Tricorn Labs » Python 2.6 multiprocessing package ¤ò¿¨¤Ã¤Æ¤ß¤¿¡£ [GIL²óÈò]