topicModelingTickets/init.py

421 lines
10 KiB
Python
Raw Normal View History

2017-10-11 17:16:04 +02:00
# -*- coding: utf-8 -*-
2017-10-17 10:13:49 +02:00
from miscellaneous import *
2017-10-16 14:01:38 +02:00
from stop_words import get_stop_words
2017-10-17 10:13:49 +02:00
import csv
import sys
import xml.etree.ElementTree as ET
2017-10-16 14:01:38 +02:00
2017-10-11 17:16:04 +02:00
from nltk.corpus import stopwords as nltk_stopwords
2017-10-17 10:13:49 +02:00
2017-10-11 17:16:04 +02:00
from collections import Counter
2017-10-17 10:13:49 +02:00
import time
from datetime import datetime
import os
2017-10-11 17:16:04 +02:00
# Raise the csv module's per-field size limit to sys.maxsize.
csv.field_size_limit(sys.maxsize)

# Directory that contains this script (symlinks resolved), with a trailing
# slash so that file names can be appended by plain string concatenation.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# load config
# The configuration is read from "config.ini" located next to this script.
config_ini = FILEPATH + "config.ini"

# NOTE(review): ConfigParser is not imported in this file; presumably it is
# provided by "from miscellaneous import *" -- confirm.
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
2017-10-16 14:01:38 +02:00
def create_lemma_dict(path2lemmalist):
    """
    Creates a dict out of a txt file a la:

        l1 w1
        l1 w2
        l2 w1
        l2 w2

    Result will be used as lemma_dict[word] --> lemma

    Lemma and word are both lower-cased. If a word occurs on several lines,
    the last line wins. Lines with fewer than two whitespace-separated tokens
    (e.g. blank lines) are skipped instead of raising an IndexError; tokens
    after the second one are ignored.

    :param path2lemmalist: str, path to the lemma list file
    :return: dictionary mapping word --> lemma
    """
    lemma_dict = {}
    for raw_line in textacy.fileio.read_file_lines(path2lemmalist):
        tokens = textacy.preprocess.normalize_whitespace(raw_line).split()

        # A usable line needs a lemma column and a word column.
        if len(tokens) < 2:
            continue

        lemma, word = tokens[0], tokens[1]
        lemma_dict[word.lower()] = lemma.lower()

    return lemma_dict
2017-10-16 14:01:38 +02:00
def build_thesaurus_dict(path2wordnet,returnall=False):
    """
    Creates a dict out of the deWordNet
    https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml

    Result will be used as thesaurus[word] --> main_synonym

    Processing steps:
      1. word2synsets: for every "LexicalEntry" element, the normalized
         "writtenForm" is mapped to the list of "synset" attribute values
         found on the entry's child elements.
      2. synset2Words: the inverse mapping, synset id --> list of words.
         Each list is ordered by the number of whitespace-separated tokens
         (fewest first); words flagged as main form are moved to the front.
      3. thesaurus: every word that has at least one synset is mapped to the
         first word of its first synset.

    :param path2wordnet: str, path to the deWordNet XML file
    :param returnall: bool if True, also return word2synsets, synset2Words
    :return: dictionary thesaurus, or the tuple
             (thesaurus, word2synsets, synset2Words) if returnall is True
    """
    lextree = ET.parse(path2wordnet, ET.XMLParser(encoding="utf-8"))
    lexroot = lextree.getroot()

    # Matches everything from the first "(" to the last ")" on a line.
    parenthesized = re.compile(r"\((.*)\)")

    # Transliteration of German special characters, followed by the removal
    # of all dots. Applied in this order to every written form.
    replacements = (
        ("ß", "ss"),
        ("ö", "oe"),
        ("Ö", "Oe"),
        ("ü", "ue"),
        ("Ü", "Ue"),
        ("ä", "ae"),
        # NOTE(review): the other upper-case umlauts keep their case
        # ("Oe", "Ue"); "Ä" maps to lower-case "ae". Kept as is because the
        # resulting keys may be relied upon elsewhere -- confirm.
        ("Ä", "ae"),
        (".", ""),
    )

    # Build word2synsets, shaped like {"w1": ["s1", "s2"]}
    word2synsets = {}
    for ro in lexroot:
        for elem in ro:
            if elem.tag != "LexicalEntry":
                continue

            synlist = []
            string = "WORD"  # key used when no child carries a writtenForm

            for subentry in elem:
                attributes = subentry.attrib

                if "synset" in attributes:
                    synlist.append(attributes["synset"])

                if "writtenForm" in attributes:
                    string = attributes["writtenForm"]

                    for old, new in replacements:
                        string = string.replace(old, new)

                    # Drop everything in parentheses, but remember whether
                    # the form was marked as main form ("Hauptform").
                    is_main_form = "auptform" in string
                    string = parenthesized.sub(" ", string)
                    if is_main_form:
                        string = string + " (hauptform)"

                    # Collapse longer runs of whitespace.
                    string = textacy.preprocess.normalize_whitespace(string)
                    string = string.strip()

            if string != '':
                word2synsets[string] = synlist

    # Build synset2Words, shaped like {"s1": ["w1", "w2"]}.
    # Keys of word2synsets are never empty (see the check above).
    synset2Words = {}
    for word, synsets in word2synsets.items():
        for syn in synsets:
            synset2Words.setdefault(syn, []).append(word)

    # Sort every synonym list
    for words in synset2Words.values():
        # By number of tokens in the string, fewer first. The sort is stable;
        # a trailing "(hauptform)" marker counts as a token here.
        words.sort(key=lambda w: len(w.split()))

        # Move main forms (marker stripped) to the front. Each main form is
        # placed before the ones found earlier, hence the reversed order.
        main_forms = [parenthesized.sub(" ", w).strip()
                      for w in words if "(hauptform)" in w]
        others = [w for w in words if "(hauptform)" not in w]
        words[:] = main_forms[::-1] + others

    # word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn
    thesaurus = {}
    for word, synsets in word2synsets.items():
        # Entries without any synset cannot be mapped to a synonym.
        if not synsets:
            continue

        first_synset = synsets[0]  # pick the first synset, for practical reasons
        syns = synset2Words[first_synset]  # [syn1, syn2, ... ]
        first_syn = syns[0]  # assumption: the first synonym is the main synonym

        # Strip the "(hauptform)" marker from the key.
        thesaurus[parenthesized.sub(" ", word).strip()] = first_syn

    if returnall:
        return thesaurus, word2synsets, synset2Words
    return thesaurus
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
def create_stopword_lists(*paths):
    """
    Build a German and an English stop word list.

    Each list combines the stop words shipped with
        stop_words
        nltk
        spacy
    with the words read from the matching additional files.

    :param paths: additional file paths where each file looks like
        w1
        w2
        w3
    file names must be a la de_stopwords_1.txt, en_stopwords_2.txt
    :return: lists: de_stopwords, en_stopwords
    """

    def collect(lang_code, nltk_name, spacy_module):
        # from packages
        from_stop_words = list(get_stop_words(lang_code))
        from_nltk = list(nltk_stopwords.words(nltk_name))
        from_spacy = list(__import__(spacy_module, globals(), locals(), ['object']).STOP_WORDS)

        # from files: keep the paths named "<lang_code>_stopwords_..."
        selected_paths = []
        for candidate in paths:
            name_parts = os.path.basename(candidate).split("_")
            if name_parts[0] == lang_code and name_parts[1] == 'stopwords':
                selected_paths.append(candidate)
        from_files = list_from_files(*selected_paths)

        # combine everything: normalize whitespace, transliterate, deduplicate
        transliterate = replaceRockDots_lambda()
        normalized = [textacy.preprocess.normalize_whitespace(entry)
                      for entry in from_stop_words + from_nltk + from_spacy + from_files]
        return list(set(map(transliterate, normalized)))

    ## GERMAN
    de_stop_words = collect('de', 'german', "spacy.de")

    ## ENGLISH
    en_stop_words = collect('en', 'english', "spacy.en")

    return de_stop_words, en_stop_words
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
def build_words_for_spellchecking(path2words):
    """
    create word-Counter for spellchecking

    http://norvig.com/spell-correct.html
    http://wortschatz.uni-leipzig.de/en/download

    http://pcai056.informatik.uni-leipzig.de/downloads/corpora/deu_news_2015_1M.tar.gz

    The whole file is read, lower-cased and split into runs of word
    characters (regex \\w+); the Counter holds the frequency of each run.

    :param path2words: str, path to the text corpus
    :return: Counter mapping lower-cased word --> number of occurrences
    """
    # The context manager closes the file handle deterministically.
    # The platform default encoding is used, as before.
    with open(path2words) as corpus:
        text = corpus.read()

    return Counter(re.findall(r'\w+', text.lower()))
2017-10-11 17:16:04 +02:00
2017-10-16 14:01:38 +02:00
##################################################################################################
2017-10-11 17:16:04 +02:00
def main():
    """
    Build all lookup resources and store each one with save_obj.

    Input and output file names are taken from the module-level config and
    resolved relative to the "ressources/" directory next to this script.
    The resources are built in this order: thesaurus, lemma dict, word
    counter for spellchecking, stop word lists, noun list, first name list.
    The elapsed time in minutes is logged at the end.
    """
    start = time.time()
    logprint("Init")

    ressources_path = FILEPATH + "ressources/"

    def ressource(section, option):
        # Path below the ressources directory as configured in config.ini.
        return ressources_path + config.get(section, option)

    # THESAURUS
    logprint("Build and save Thesaurus")
    thesaurus = build_thesaurus_dict(ressource("thesaurus", "input"))
    save_obj(thesaurus, ressource("thesaurus", "pickle_file"))

    # LEMMA
    logprint("create and save lemma_dict")
    lemma_dict = create_lemma_dict(ressource("lemmatization", "input"))
    save_obj(lemma_dict, ressource("lemmatization", "pickle_file"))

    # SPELLCHECKING
    logprint("Build and save Wordlist for Spellchecking")
    word_counter = build_words_for_spellchecking(ressource("spellchecking", "input"))
    save_obj(word_counter, ressource("spellchecking", "pickle_file"))

    # STOPWORDS
    logprint("Build and save stoppwortliste")
    stopword_files = [ressource("de_stopwords", option)
                      for option in ("input1", "input2", "input3")]
    de_stop_words, en_stop_words = create_stopword_lists(*stopword_files)
    save_obj(de_stop_words, ressource("de_stopwords", "pickle_file"))
    save_obj(en_stop_words, ressource("en_stopwords", "pickle_file"))

    # NOUNS
    logprint("Build and save nomenliste")
    noun_files = [ressource("nouns", option)
                  for option in ("input", "input1", "input2")]
    nouns = list_from_files(*noun_files)
    save_obj(nouns, ressource("nouns", "pickle_file"))

    # FIRST NAMES
    logprint("Build and save firstnameslist")
    first_names = list_from_files(ressource("firstnames", "input"))
    save_obj(first_names, ressource("firstnames", "pickle_file"))

    elapsed_minutes = (time.time() - start) / 60
    logprint("Time Elapsed Initialization:{0} min".format(elapsed_minutes))
2017-10-11 17:16:04 +02:00
# Run the initialization only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()