-->

Welcome to our Coding with Python page! Here you will find various code for PHP, Python, AI, Cyber, etc., as well as Electricity, Energy, and Nuclear Power.

Tuesday 3 July 2018

Python | Read text and return lists of different parts of speech

This function reads through text and returns lists of all the different parts of speech (nouns, verbs, proper nouns, etc.). My sense is that there is a more elegant and probably more efficient way to accomplish the same thing. The code feels really repetitive so I'm hoping there is a better way.

import nltk


def find_pos(tokens):
    '''Tag tokens with NLTK and group the tagged tokens by part of speech.

    Each token is tagged with ``nltk.pos_tag`` and the resulting tagged items
    are sorted into categories according to their tag.

    Note that some words are returned twice:
    - Nouns are separated into common and proper as well as grouped together.
    - Modals are included in the verbs as well as returned separately.

    Tokens whose tag belongs to none of the returned categories (for example
    'CD', 'DT', 'EX', 'WDT' or punctuation tags) are left out of the result.

    Args:
        tokens: the sequence of tokens passed to ``nltk.pos_tag``.

    Returns:
        A list of eleven lists of tagged items, in this order:
        [nouns, common_nouns, verbs, adjectives, pronouns, adverbs,
        proper_nouns, conjunctions, prepositions, interjections, modals].
    '''
    tagged = nltk.pos_tag(tokens)

    # Tags that make up each category.
    # Note that IN can be either a preposition or a conjunction; for now it
    # is listed with the prepositions.
    tags_by_category = {
        'common_nouns': ('NN', 'NNS'),
        'verbs': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adjectives': ('JJ', 'JJR', 'JJS'),
        'pronouns': ('PRP', 'PRP$', 'WP', 'WP$'),
        'adverbs': ('RB', 'RBR', 'RBS', 'WRB'),
        'proper_nouns': ('NNP', 'NNPS'),
        'conjunctions': ('CC',),
        'prepositions': ('IN', 'TO'),
        'interjections': ('UH',),
        'modals': ('MD',),  # these are also verbs, merged in below
    }

    # Invert the table once so each tagged item needs a single dict lookup
    # instead of walking a chain of list-membership tests.
    category_of_tag = {
        tag: category
        for category, tags in tags_by_category.items()
        for tag in tags
    }

    grouped = {category: [] for category in tags_by_category}
    for tagged_token in tagged:
        category = category_of_tag.get(tagged_token[1])
        if category is not None:
            grouped[category].append(tagged_token)

    # Concatenate rather than append: appending the modal list would nest it
    # inside the verb list as a single element instead of adding each modal.
    verbs = grouped['verbs'] + grouped['modals']
    nouns = grouped['common_nouns'] + grouped['proper_nouns']

    return [
        nouns,
        grouped['common_nouns'],
        verbs,
        grouped['adjectives'],
        grouped['pronouns'],
        grouped['adverbs'],
        grouped['proper_nouns'],
        grouped['conjunctions'],
        grouped['prepositions'],
        grouped['interjections'],
        grouped['modals'],
    ]
Here is some code to test it:
# Sample passage used as input; the adjacent string literals are joined into
# one string.
text = (
    'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in '
    'want of a wife. However little known the feelings or views of such a man may be on his first entering a '
    'neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is '
    'considered the rightful property of some one or other of their daughters. '
)

# Tokenize the sample with NLTK and run it through find_pos; the returned
# lists are not stored or printed here.
tokens = nltk.word_tokenize(text)
find_pos(tokens)

No comments:

Post a Comment

Thanks for your comments

Rank

seo