This function reads through text and returns lists of all the different parts of speech (nouns, verbs, proper nouns, etc.). My sense is that there is a more elegant and probably more efficient way to accomplish the same thing. The code feels really repetitive so I'm hoping there is a better way.
import nltk
def find_pos(tokens):
    """Group POS-tagged tokens by part of speech.

    The tokens are tagged with ``nltk.pos_tag`` and every resulting
    ``(word, tag)`` pair is filed under the category its tag belongs to.

    Note that some words are returned twice:

    - Nouns are separated into common and proper, and are also grouped
      together in a combined list.
    - Modals are added to the verbs as well as returned separately.

    Tokens whose tag belongs to none of the categories below (for example
    determiners, cardinal numbers, or punctuation) are not returned.

    Args:
        tokens: Sequence of word tokens, passed unchanged to
            ``nltk.pos_tag``.

    Returns:
        A list of eleven lists of ``(word, tag)`` pairs, in this order:
        nouns, common nouns, verbs (including modals), adjectives,
        pronouns, adverbs, proper nouns, conjunctions, prepositions,
        interjections, modals.
    """
    # Tags that make up each category.
    # Note that IN can be either a preposition or a conjunction; for now
    # it is listed with the prepositions.
    tags_by_category = {
        'common_nouns': ('NN', 'NNS'),
        'verbs': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        'adjectives': ('JJ', 'JJR', 'JJS'),
        'pronouns': ('PRP', 'PRP$', 'WP', 'WP$'),
        'adverbs': ('RB', 'RBR', 'RBS', 'WRB'),
        'proper_nouns': ('NNP', 'NNPS'),
        'conjunctions': ('CC',),
        'prepositions': ('IN', 'TO'),
        'interjections': ('UH',),
        'modals': ('MD',),  # also merged into the verbs below
    }

    # Invert the table so each tag resolves to its category in one lookup
    # instead of walking a long if/elif chain.
    category_of_tag = {}
    for category, tags in tags_by_category.items():
        for tag in tags:
            category_of_tag[tag] = category

    grouped = {category: [] for category in tags_by_category}
    for tagged_token in nltk.pos_tag(tokens):
        category = category_of_tag.get(tagged_token[1])
        if category is not None:
            grouped[category].append(tagged_token)

    nouns = grouped['common_nouns'] + grouped['proper_nouns']
    # Concatenate rather than append: appending would nest the whole list
    # of modals inside the verbs as a single element.
    verbs = grouped['verbs'] + grouped['modals']

    return [
        nouns,
        grouped['common_nouns'],
        verbs,
        grouped['adjectives'],
        grouped['pronouns'],
        grouped['adverbs'],
        grouped['proper_nouns'],
        grouped['conjunctions'],
        grouped['prepositions'],
        grouped['interjections'],
        grouped['modals'],
    ]
Here is some code to test it:
# Sample passage used to exercise find_pos.
text = (
    'It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in '
    'want of a wife. However little known the feelings or views of such a man may be on his first entering a '
    'neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is '
    'considered the rightful property of some one or other of their daughters. '
)

# Split the passage into word tokens, then group them by part of speech.
tokens = nltk.word_tokenize(text)
find_pos(tokens)
No comments:
Post a Comment
Thanks for your comments