refactoring.

Next comes the restructuring: cleaned corpus --> doc-term matrix --> LDA, and labeled_lines.txt --> LLDA
jannis.grundmann 2017-12-11 12:10:40 +01:00
parent db7ea1a72a
commit 412f25d8d8
16 changed files with 340 additions and 126 deletions

View File

@@ -30,6 +30,16 @@ with open(config_ini) as f:
  def clean(stringstream):#, NOUNS):
+ """
+ fix bad unicode
+ seperate_words_on_regex `\=~%^&*()_+\[\]{};\'"|</>
+ normalize whitespace
+ remove linebreaks
+ replaceRockDöts
+ :param stringstream: str-gen
+ :return: string-gen
+ """
  #NOUNS = [n.lower() for n in NOUNS]
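The docstring added above names the cleaning steps without showing them. As a rough illustration, here is a minimal stdlib-only sketch of those steps (the function name, sample input, and exact special-character set are assumptions, not the repo's implementation):

    import re

    SPECIALS = "`\\=~%^&*()_+[]{};'\"|</>"

    def clean_sketch(stringstream):
        """Yield cleaned strings: fold umlauts, split on special characters, normalize whitespace."""
        for text in stringstream:
            # replaceRockDots step: fold German umlauts and sharp s to ASCII
            for src, dst in (("ß", "ss"), ("ä", "ae"), ("ö", "oe"), ("ü", "ue"),
                             ("Ä", "Ae"), ("Ö", "Oe"), ("Ü", "Ue")):
                text = text.replace(src, dst)
            # separate words on the listed special characters
            text = re.sub("[" + re.escape(SPECIALS) + "]", " ", text)
            # remove linebreaks and normalize whitespace
            yield re.sub(r"\s+", " ", text).strip()

    print(list(clean_sketch(["Grüße,\nbitte   Passwort\\2017 zurücksetzen!"])))
    # ['Gruesse, bitte Passwort 2017 zuruecksetzen!']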
@@ -90,19 +100,22 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
  def cleanCorpus(corpus):
  logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))
+ """
  ressources_path = FILEPATH + "ressources/"
  path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
  #NOUNS = load_obj(path2nouns_list)
  #noun_disjunction = '|'.join(NOUNS)
  #nouns_tuples = []
  #for n in NOUNS:
  #    nouns_tuples.append((n.lower(),n))
+ """
+ cleanCorpus_name = corpus.lang + "_clean"
+ # load Corpus
  raw_corpus = corpus
  parser = corpus.spacy_lang

@@ -115,13 +128,14 @@ def cleanCorpus(corpus):
  )
- # leere docs aus corpi kicken
+ # leere docs aus corpus kicken
  cleaned_corpus.remove(lambda doc: len(doc) == 0)
  #save corpus
- cleanCorpus_name = corpus.lang + "_clean"
  save_corpus(corpus=cleaned_corpus, corpus_path=corpus_de_path, corpus_name=cleanCorpus_name)

View File

@@ -90,7 +90,16 @@ corpus_de_path = FILEPATH + config.get("de_corpus", "path")
  def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, lang, printrandom=0):
+ """
+ Use textacy to create a Corpus out of the ITMC-Ticket.csv
+ :param path2_csv: str
+ :param corpus_path: str
+ :param content_collumn_name: str the Collumn which is used as the Docs text
+ :param lang: str standard 2-letter language
+ :param printrandom: print n random Documents
+ :return: textacy.Corpus
+ """
  # print paths
  path_csv_split = path2_csv.split("/")
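The new docstring describes how ticketcsv2Corpus turns the ITMC ticket CSV into a textacy.Corpus. Below is a hedged sketch of only the CSV-reading half; the column name "Description" and the function name are illustrative assumptions, and the actual Corpus construction and metadata handling are omitted:

    import csv
    import random

    def ticket_texts_from_csv(path2_csv, content_collumn_name="Description", printrandom=0):
        """Return the text column of every ticket row; optionally print a few random ones."""
        with open(path2_csv, newline="", encoding="utf-8") as f:
            texts = [row[content_collumn_name] for row in csv.DictReader(f)]
        for text in random.sample(texts, min(printrandom, len(texts))):
            print(text)  # sanity check on a random sample
        return texts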

init.py (215 changed lines)
View File

@@ -28,20 +28,20 @@ with open(config_ini) as f:
  def create_lemma_dict(path2lemmalist):
  """
- Creates a dict out of a file a la:
+ Creates a dict out of a txt file a la:
  l1 w1
  l1 w2
  l2 w1
  l2 w2
- Result will be used as lemma_dict["word"] --> lemma
+ Result will be used as lemma_dict[word] --> lemma
  :param path2lemmalist: str
  :return: dictionary
  """
- lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(
- textacy.fileio.read_file_lines(path2lemmalist))))
+ file_gen = textacy.fileio.read_file_lines(path2lemmalist)
+ lemmalist = list(map(textacy.preprocess.normalize_whitespace, list(file_gen)))
  lemma_dict = {}
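The lemma file format in the docstring is one '<lemma> <wordform>' pair per line (see the kennwort/kennworts lines added further down). A stdlib-only sketch of the same mapping, without the textacy helpers, might look like this (assumed, not the repo's code):

    def create_lemma_dict_sketch(path2lemmalist):
        """Map each word form to its lemma, given lines of the form '<lemma> <wordform>'."""
        lemma_dict = {}
        with open(path2lemmalist, encoding="utf-8") as f:
            for line in f:
                parts = line.split()
                if len(parts) == 2:
                    lemma, word = parts
                    lemma_dict[word.lower()] = lemma.lower()
        return lemma_dict

    # create_lemma_dict_sketch("lemmas.txt")["kennworts"] -> "kennwort"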
@@ -63,7 +63,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
  Creates a dict out of the deWordNet
  https://raw.githubusercontent.com/hdaSprachtechnologie/odenet/master/deWordNet.xml
- Result will be used as lemma_dict["word"] --> lemma
+ Result will be used as thesaurus[word] --> main_synonym
  :param path2wordnet: str
  :param returnall: bool if True, also return , word2synsets, synset2Words
@@ -73,6 +73,7 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
  lexroot = lextree.getroot()
+ # Build word2synsets
  word2synsets = {}
  template = {"w1": ["s1", "s2"]}

@@ -82,7 +83,6 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
  lex_dictlist = [subentry.attrib for subentry in elem]
  # idee technischer thesaurus
- # idee hauptsynonmy muss einzelnes wort sein
  synlist = []
  string = "WORD"
@@ -96,55 +96,92 @@ def build_thesaurus_dict(path2wordnet,returnall=False):
  if 'writtenForm' in lex_dict.keys():
  string = (lex_dict["writtenForm"])
+ if string == "Kennwort":
+ pass
  # replaceRockDots
  string = re.sub(r'[ß]', "ss", string)
  string = re.sub(r'[ö]', "oe", string)
+ string = re.sub(r'[Ö]', "Oe", string)
  string = re.sub(r'[ü]', "ue", string)
+ string = re.sub(r'[Ü]', "Ue", string)
  string = re.sub(r'[ä]', "ae", string)
+ string = re.sub(r'[Ä]', "ae", string)
  # alle punkte raus
  string = re.sub(r'[.]', "", string)
  # alles in klammern raus
- string = re.sub(r"\((.*)\)", " ", string)
+ if "auptform" in string:
+ string = re.sub(r"\((.*)\)", " ", string)
+ string = string + " (hauptform)" # evtl. als hauptform merken
+ else:
+ string = re.sub(r"\((.*)\)", " ", string)
  # längeres leerzeichen normalisieren
  string = textacy.preprocess.normalize_whitespace(string)
- string = string.lower().strip()
+ string = string.strip()#.lower()
- word2synsets[string] = synlist
+ if string != '':
+ word2synsets[string] = synlist
+ # Build synset2Words
  synset2Words = {}
  template = {"s1": ["w1","w2"]}
  for word,synset in word2synsets.items():
  if word != '':
  for syn in synset:
  if syn not in synset2Words.keys():
  synset2Words[syn] = [word]
  else:
  synset2Words[syn].append(word)
- # nach anzhal der wörter in den strings sortieren
- for synset in word2synsets.values():
- synset.sort(key=lambda x: len(x.split()))
+ # Sortieren
+ for words in synset2Words.values():
+ words.sort(key=lambda w: len(w.split())) # nach anzhal der wörter in den strings (weniger nach vorne)
+ for w in words:
+ if "(hauptform)" in w:
+ to_insert = re.sub(r"\((.*)\)", " ", w).strip()
+ words.remove(w)
+ words.insert(0, to_insert) # Hauptform evtl. nach vorne
  thesaurus = {}
  thesaurus_template = {"w1" : "mainsyn"}
+ # word --> [synset1, synset2, .. ] --> synset1 --> [syn1, syn2, ... ] --> syn1 / mainsyn
- for word,synset in word2synsets.items():
+ for word,synsets in word2synsets.items(): #word , [synset1, synset2, .. ]
  try:
- thesaurus[word] = synset2Words[synset[0]][0] #Ann.: erstes synonym ist das Hauptsynonym #todo nach (hauptform) suchen?
+ if "Passwort" in word:
+ x=2
+ first_synset = synsets[0] #erstes synset wählen . praktischer Grund
+ syns = synset2Words[first_synset] # [syn1, syn2, ... ]
+ first_syn = syns[0] # erstes synonym (evtl. Hauptform) wählen
+ word = re.sub(r"\((.*)\)", " ", word).strip() #(hautpform weg)
+ thesaurus[word] = first_syn #Ann.: erstes synonym ist das Hauptsynonym
  except:
  pass
  if returnall:
  return thesaurus, word2synsets, synset2Words
  else:
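The hunk above changes the main-synonym selection: take the first synset of a word, sort its synonyms by word count, and prefer an entry marked "(hauptform)". A simplified, self-contained sketch of that selection rule (it skips the in-place list surgery of the original, and the sample data is invented):

    import re

    def pick_main_synonyms(word2synsets, synset2words):
        """For each word pick a main synonym: a '(hauptform)' entry if present, else the shortest synonym."""
        thesaurus = {}
        for word, synsets in word2synsets.items():
            if not synsets:
                continue
            syns = sorted(synset2words.get(synsets[0], []), key=lambda w: len(w.split()))
            for s in syns:
                if "(hauptform)" in s:
                    # move the marked Hauptform to the front, stripped of its marker
                    syns.insert(0, re.sub(r"\(.*\)", "", s).strip())
                    break
            if syns:
                thesaurus[re.sub(r"\(.*\)", "", word).strip()] = syns[0]
        return thesaurus

    print(pick_main_synonyms({"passwort": ["s1"]},
                             {"s1": ["kennwort (hauptform)", "passwort"]}))
    # {'passwort': 'kennwort'}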
@@ -237,39 +274,8 @@ def build_words_for_spellchecking(path2words)
  ##################################################################################################
- # THESAURUS
- ressources_path = FILEPATH + "ressources/"
- path2wordnet = ressources_path + config.get("thesaurus","input")
- path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
- # SPELLCHECKING
- path2words_file = ressources_path + config.get("spellchecking","input")
- path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
- # LEMMA
- path2lemma_file = ressources_path + config.get("lemmatization","input")
- path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
- # NOMEN
- nouns0 = ressources_path + config.get("nouns","input")
- nouns1 = ressources_path + config.get("nouns","input1")
- nouns2 = ressources_path + config.get("nouns","input2")
- path2nouns_list = ressources_path + config.get("nouns","pickle_file")
- # VORNAMEN
- firstnames_txt = ressources_path + config.get("firstnames","input")
- path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
- # STOPWORDS
- stop1 = ressources_path + config.get("de_stopwords","input1")
- stop2 = ressources_path + config.get("de_stopwords","input2")
- stop3 = ressources_path + config.get("de_stopwords","input3")
- path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
- path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")
@@ -277,44 +283,135 @@ def main():
  start = time.time()
  logprint("Init: {0}".format(datetime.now()))
+ ressources_path = FILEPATH + "ressources/"
- """"""
- logprint("create and save lemma_dict")
- lemma_dict = create_lemma_dict(path2lemma_file)
- save_obj(lemma_dict, path2lemmadict)
- logprint("Build and save Wordlist for Spellchecking")
- words = build_words_for_spellchecking(path2words_file)
- save_obj(words, path2wordlist)
+ # THESAURUS
  logprint("Build and save Thesaurus")
+ path2wordnet = ressources_path + config.get("thesaurus", "input")
  thesaurus = build_thesaurus_dict(path2wordnet)
+ path2thesaurus_dict = ressources_path + config.get("thesaurus", "pickle_file")
  save_obj(thesaurus, path2thesaurus_dict)
+ # LEMMA
+ logprint("create and save lemma_dict")
+ path2lemma_file = ressources_path + config.get("lemmatization", "input")
+ lemma_dict = create_lemma_dict(path2lemma_file)
+ path2lemmadict = ressources_path + config.get("lemmatization", "pickle_file")
+ save_obj(lemma_dict, path2lemmadict)
+ # SPELLCHECKING
+ logprint("Build and save Wordlist for Spellchecking")
+ path2words_file = ressources_path + config.get("spellchecking", "input")
+ words = build_words_for_spellchecking(path2words_file)
+ path2words_counter = ressources_path + config.get("spellchecking", "pickle_file")
+ save_obj(words, path2words_counter)
+ # STOPWORDS
  logprint("Build and save stoppwortliste")
+ stop1 = ressources_path + config.get("de_stopwords", "input1")
+ stop2 = ressources_path + config.get("de_stopwords", "input2")
+ stop3 = ressources_path + config.get("de_stopwords", "input3")
  de_stop_words, en_stop_words = create_stopword_lists(stop1, stop2, stop3)
+ path2stopwordlist_de = ressources_path + config.get("de_stopwords", "pickle_file")
  save_obj(de_stop_words, path2stopwordlist_de)
+ path2stopwordlist_en = ressources_path + config.get("en_stopwords", "pickle_file")
  save_obj(en_stop_words, path2stopwordlist_en)
+ # NOMEN
  logprint("Build and save nomenliste")
- #nouns = list_from_files(nouns1,nouns2)
- nouns = list_from_files(nouns0)
+ nouns0 = ressources_path + config.get("nouns", "input")
+ nouns1 = ressources_path + config.get("nouns", "input1")
+ nouns2 = ressources_path + config.get("nouns", "input2")
+ nouns = list_from_files(nouns0,nouns1,nouns2)
+ path2nouns_list = ressources_path + config.get("nouns", "pickle_file")
  save_obj(nouns, path2nouns_list)
+ # VORNAMEN
  logprint("Build and save firstnameslist")
+ firstnames_txt = ressources_path + config.get("firstnames", "input")
  vornamen = list_from_files(firstnames_txt)
+ path2firstnameslist = ressources_path + config.get("firstnames", "pickle_file")
  save_obj(vornamen, path2firstnameslist)
  end = time.time()
  logprint("Time Elapsed Initialization:{0} min".format((end - start) / 60))

View File

@@ -30,15 +30,13 @@ start = time.time()
  # todo modelle testen
+ # todo ticket2kbkeys, subj, cats in init.py
  logprint("main.py started at {}".format(datetime.now()))
- init.main()
+ #init.main()
  logprint("")
  raw_corpus = corporization.main()

View File

@@ -217,7 +217,6 @@ def save_corpus(corpus, corpus_path, corpus_name):
  :param corpus_path: str
  :param corpus_name: str (should content the language like "_de_")
  """
- #todo pos und ner tagging speichern
  # save parser
  parser = corpus.spacy_lang

View File

@@ -126,7 +126,7 @@ def remove_first_names():
  def remove_addresses(string):
  pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen
- def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
+ def lemmatizeWord(word,lemma_dict=LEMMAS,n=5):
  for i in range(n):
  try:
  word = lemma_dict[word.lower()] if word.lower() in lemma_dict.keys() else word.lower()
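lemmatizeWord applies the lemma dictionary repeatedly; the hunk raises the number of passes from 3 to 5, which only matters when entries chain (w1 -> w2 -> w3). A sketch of that idea with an early exit once a fixed point is reached (the two-step toy chain below is hypothetical):

    def lemmatize_word(word, lemma_dict, n=5):
        """Apply the lemma dictionary up to n times so chained entries settle on a fixed point."""
        word = word.lower()
        for _ in range(n):
            nxt = lemma_dict.get(word, word)
            if nxt == word:  # fixed point reached, stop early
                break
            word = nxt
        return word

    print(lemmatize_word("kennwortes", {"kennwortes": "kennworts", "kennworts": "kennwort"}))
    # kennwort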
@@ -134,26 +134,29 @@ def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
  print(word)
  return word
- def getFirstSynonym(word, thesaurus=THESAURUS,n=3):
+ def getFirstSynonym(word, thesaurus=THESAURUS, n=3):
  for i in range(n):
  try:
- word = thesaurus[word.lower()] if word.lower() in thesaurus.keys() else word.lower()
+ if word in thesaurus.keys():
+ return thesaurus[word]
+ elif word.title() in thesaurus.keys():
+ return thesaurus[word.title()]
+ elif word.lower() in thesaurus.keys():
+ return thesaurus[word.lower()]
+ else:
+ return word
  except:
- print(word)
+ print("THESAURUSFEHLER BEI: {}".format(word))
  return word
- """
- if not isinstance(word, str):
- return str(word)
- word = word.lower()
- if word in thesaurus.keys():
- return thesaurus[word]
- else:
- return str(word)
- """
  ########################## Spellchecking ##########################################
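The rewritten getFirstSynonym now returns as soon as a lookup hits, trying the exact form, then Title case, then lower case (so "passwort" can still match a "Passwort" thesaurus key). A condensed sketch of that lookup order (a sketch, not the repo's function):

    def get_first_synonym(word, thesaurus):
        """Return the main synonym, trying the exact form, then Title case, then lower case."""
        for candidate in (word, word.title(), word.lower()):
            if candidate in thesaurus:
                return thesaurus[candidate]
        return word

    print(get_first_synonym("passwort", {"Passwort": "kennwort"}))  # kennwort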
@@ -328,6 +331,15 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
  def extract_from_corpus(corpus):
+ """
+ Extract from each doc from a corpus a string containing disired token_texts
+ :param corpus: textacy.Corpus
+ :return: string-gen
+ """
+ # WHITELIST erstellen. Enthält zumindest die evtuellen Topics
  WHITELIST = ["boss", "sap", "firefox"] #todo autogenerierung relv. techn. begriffe
@@ -337,6 +349,7 @@ def extract_from_corpus(corpus):
  WHITELIST = WHITELIST + kb_cats + kb_keys + kb_subjs
  THESAURUS = load_obj(path2thesaurus_dict)
  #WORDS = load_obj(path2wordsdict)
  LEMMAS = load_obj(path2lemmadict)

@@ -344,6 +357,9 @@ def extract_from_corpus(corpus):
  #EN_STOP_WORDS = load_obj(path2ENstopwordlist)
  VORNAMEN = load_obj(path2firstnameslist)
+ ents_boss = []
+ ents_sap = []
  for doc in corpus:
  result = []
@@ -353,10 +369,16 @@ def extract_from_corpus(corpus):
  for tok in doc:
- if tok.lower_ =="boss" or tok.lower_ =="sap":
- print(tok.lower_+": "+tok.ent_type_)
+ """
+ if tok.lower_ =="boss":
+ ents_boss.append(tok.ent_type_)
+ if tok.lower_ =="sap":
+ ents_sap.append(tok.ent_type_)
+ """
+ # wenn in whitelist, direkt übernehmen
  if tok.lower_ in WHITELIST:
  result.append(tok.lower_)
@@ -372,25 +394,27 @@ def extract_from_corpus(corpus):
  or tok.lower_ in VORNAMEN:
  continue
- # cut after footer
- if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei INC40506 das meiste weg
- break
+ # boss/SAP ent_type = 'ORG' oder '' (ein-weimal LOC oder PERSON)
+ # cut after footer
+ if replaceRockDots(tok.lower_) in ["gruss", "grusse", "gruesse", "gruessen", "grusses"]: # fehler schneidet bei zB INC40506 das meiste weg
+ break
  if tok.pos_ in ["NOUN"] \
- or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART"]:
+ or tok.ent_type_ in ["NORP","FACILITY","ORG","PRODUCT","WORK_OF_ART","LOC"]:
  #or tok.dep_ == "ROOT":
  # or tok.lower_ in NOUNS \ #,"PERSON"] \
  toktext = tok.lower_
  toktext = lemmatized_word
- """
+ # hauptsynonym bilden idee zwar das Huaptsyn bilden und zählen aber die originalen wörter in den llda algo geben
+ """
  first_synonym = getFirstSynonym(toktext, thesaurus=THESAURUS)
- if first_synonym is not None:
+ if first_synonym is not None or first_synonym != '':
  toktext = first_synonym if len(first_synonym.split()) == 1 else toktext
  """
@@ -402,6 +426,14 @@ def extract_from_corpus(corpus):
  yield " ".join(result)
+ """
+ print(list(set(ents_sap)))
+ ['', 'ORG', 'PERSON', 'LOC']
+ print(list(set(ents_boss)))
+ ['', 'ORG', 'PERSON', 'LOC']
+ """
@@ -433,6 +465,9 @@ def preprocessCorpus(corpus, clean_in_meta):
  )
+ # idee labeled_lines.txt enthählt bigramme mit unterstrich
+ # todo preCorpus weg. llda bekommt labaled_lines.txt und lda doctermamtrix
  # leere docs aus corpi kicken
  pre_corpus.remove(lambda doc: len(doc) == 0)
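The new comments sketch the plan: labeled_lines.txt should carry bigrams joined with an underscore so the LLDA tool can treat them as single terms. One way that could look (the bracketed label and the line layout are illustrative assumptions, not a fixed format):

    def with_underscore_bigrams(tokens):
        """Append word bigrams joined by '_' so a line-based LLDA input sees them as single terms."""
        bigrams = ["_".join(pair) for pair in zip(tokens, tokens[1:])]
        return tokens + bigrams

    tokens = ["sap", "passwort", "zuruecksetzen"]
    print("[sap_problem] " + " ".join(with_underscore_bigrams(tokens)))
    # [sap_problem] sap passwort zuruecksetzen sap_passwort passwort_zuruecksetzen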

View File

@@ -71439,7 +71439,7 @@
  </Sense>
  </LexicalEntry>
  <LexicalEntry id="w10531">
- <Lemma writtenForm="Passwort" partOfSpeech="n"/>
+ <Lemma writtenForm="Passwort (Hauptform)" partOfSpeech="n"/>
  <Sense id="w10531_2177-n" synset="de-2177-n">
  </Sense>
  </LexicalEntry>

@@ -750689,4 +750689,4 @@
  </Synset>
  </Lexicon>
  </LexicalResource>

View File

@@ -1,3 +1,5 @@
+ kennwort kennworts
+ kennwort kennwortes
  a as
  aachen aachens
  aal aale

@@ -358471,4 +358473,4 @@ zynisch zynischstes
  zynische zynischen
  zynischere zynischeren
  zynischste zynischsten
  zyste zysten

test.py (110 changed lines)

File diff suppressed because one or more lines are too long

View File

@@ -571,14 +571,14 @@ def jgibbsLLDA_KB_v2(corpus, path2save_results, top_topic_words = 7):
  count_dict[kb] = 1
  sorted_dict = sorted(count_dict.items(), key=operator.itemgetter(1))
+ """
  for k,v in sorted_dict:
  subs = kb2subjects_dict[k]
  keys = kb2keywords_dict[k]
  print(subs, keys , v) # frage wieviele tickets pro topic?
  print("kb_entrys used: {}".format(len(sorted_dict))) # frage wie viele kb_entry's insg genutzt?: 155
+ """
  labelist = ticket2keywords_dict.values()
@@ -644,7 +644,7 @@ def load_from_labled_lines(path):
  #idee plan
  # clean laden, pre laden
- # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee notfalls bigramme als geklammerte "wörter"
+ # unigramme und num/wort-bigramme doc-term # frage wie geht llda mit bigrammen um? idee bigramme mit _ verbinden
  # nimm nur ngrams wo midn. ein token in pre vorkommt
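The plan in the comment keeps only n-grams in which at least one token also occurs in the preprocessed corpus. A small sketch of building unigram-plus-bigram document-term counts under that rule (the function name and sample data are illustrative):

    from collections import Counter

    def doc_term_counts(docs, pre_vocab):
        """Count unigrams plus bigrams that contain at least one token from the preprocessed vocabulary."""
        matrix = []
        for tokens in docs:
            ngrams = list(tokens)
            ngrams += ["_".join(p) for p in zip(tokens, tokens[1:])
                       if p[0] in pre_vocab or p[1] in pre_vocab]
            matrix.append(Counter(ngrams))
        return matrix

    docs = [["passwort", "zuruecksetzen", "sap"], ["drucker", "treiber"]]
    print(doc_term_counts(docs, pre_vocab={"passwort", "drucker"}))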