60 lines
3.3 KiB
Python
60 lines
3.3 KiB
Python
'''
Named Entity Recognition (NER)
==============================

NER takes a text as input and searches for names of persons, companies
and countries.
'''
|
|
from nltk import ne_chunk, pos_tag, sent_tokenize, word_tokenize
|
|
from nltk.tree import Tree
|
|
|
|
''' TODO: misclassified:
[('PERSON', 'Bangkok '), ('PERSON', 'Krung Thai Bank Pcl '),
('PERSON', 'Maybank Kim Eng Securities '), ('PERSON', 'Krung Thai Bank '),
('PERSON', 'Siam Commercial Bank '), ('PERSON', 'Singapore '),
('PERSON', 'Keppel Corp '), ('ORGANIZATION', 'Companies ')]
'''
|
|
|
|
class NER:
    """Named-entity extraction built on NLTK's tokenizer, tagger and chunker."""

    @staticmethod
    def get_ne_with_label(text):
        """Return the named entities found in *text* together with their labels.

        The text is tokenized, part-of-speech tagged and passed through
        ``ne_chunk``; every resulting chunk that carries a label is treated
        as a named entity whose name is its tokens joined by single spaces.

        Args:
            text: The raw text to analyse.

        Returns:
            A list of ``(label, name)`` tuples, e.g. ``('PERSON', 'Bangkok')``.
            Each distinct name appears once, with the label of its first
            occurrence, in order of first appearance.
        """
        # Workaround kept from the original implementation: its author noted
        # that the final word of the input was not recognized, so a dummy
        # word is appended. The leading space keeps that dummy from fusing
        # with the real last word of the text.
        tokens = word_tokenize(text + ' lastword.')

        # Insertion-ordered mapping name -> label; doubles as the
        # "already seen" check so duplicates are dropped in O(1).
        label_by_name = {}
        for chunk in ne_chunk(pos_tag(tokens)):
            # Plain (word, tag) tuples have no label; only entity subtrees do.
            if not hasattr(chunk, 'label'):
                continue
            name = ' '.join(leaf[0] for leaf in chunk)
            label_by_name.setdefault(name, chunk.label())

        return [(label, name) for name, label in label_by_name.items()]
|
|
|
|
# Sample news article used as demo input for the extractor below.
# NOTE: the literal mixes physical line breaks with explicit "\n" escapes,
# so the resulting string contains both; it is kept exactly as written.
test_article = '''BANGKOK, Sept 22 (Reuters) - Southeast Asian stock markets
\nmostly fell in light volumes on Tuesday as energy shares
tracked \nfalls in global oil prices, while weaknesses in banking shares
\namid concerns about loans to an ailing steel firm sent the Thai
\nindex to a one-week closing low. \nBangkok's SET index shed nearly
1 percent after four \nsessions of gains. The index closed at 1,379.32,
its lowest \nclosing since Sept. 15. \nShares of Krung Thai Bank Pcl,
the most actively \ntraded by turnover, dropped 2.8 percent to a near
one-month low, \nreflecting potential impact of loans to Sahaviriya Steel
\nIndustries Pcl on the bank's earnings. \nMaybank Kim Eng Securities
downgraded Krung Thai Bank to \n\"hold\" from \"buy\". \n\"Even as exposure
to SSI loans will be fully provisioned, \nKTB's NPL coverage will still be
lower than 130 percent, the \ndesired level we think and hence the need for
more provisioning \nin the following quarters,\" the broker said in a report.
\nSSI shares plunged 20 percent and Siam Commercial Bank \n, among its
creditors, dropped 1 percent. The steel firm \nand its three creditors
agreed on Monday to consider options to \nrestructure debt worth over
50 billion baht ($1.40 \nbillion). \nStocks in Malaysia extended their
slides for a third \nsession, Singapore gave up early gains and Indonesia
\nhit a near one-week low, all with trading volumes below \nthe 30-day
average ahead of a public holiday on Thursday. \nAmong top losers in the
region, Indonesia's Perusahaan Gas \nNegara was down 4.4 percent and
Singapore's Keppel \nCorp was down 2.5 percent as crude oil prices fell
\namid uncertainty over global demand. \nFor Asian Companies click.'''

# Demo run: print the (label, name) pairs extracted from the sample article.
print(NER.get_ne_with_label(test_article))