runnable version

jannis.grundmann 2017-11-06 12:54:59 +01:00
parent ecc8c0c54a
commit 0a6a68b8aa
45 changed files with 826 additions and 1175985 deletions

backup.py

@@ -1,383 +0,0 @@
# -*- coding: utf-8 -*-
############# misc
def printlog(string, level="INFO"):
"""log and prints"""
print(string)
if level=="INFO":
logging.info(string)
elif level=="DEBUG":
logging.debug(string)
elif level == "WARNING":
logging.warning(string)
printlog("Load functions")
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
return functools.reduce(compose2, functions, lambda x: x)
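compose chains its arguments right to left, i.e. the right-most function is applied first. A tiny sanity check (the two lambdas below are illustrative only, not part of the module):
add_one = lambda x: x + 1
double = lambda x: x * 2
# compose(f, g)(x) == f(g(x)): add_one runs first, then double
assert compose(double, add_one)(3) == 8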
def get_calling_function():
"""finds the calling function in many decent cases.
https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
"""
fr = sys._getframe(1) # inspect.stack()[1][0]
co = fr.f_code
for get in (
lambda:fr.f_globals[co.co_name],
lambda:getattr(fr.f_locals['self'], co.co_name),
lambda:getattr(fr.f_locals['cls'], co.co_name),
lambda:fr.f_back.f_locals[co.co_name], # nested
lambda:fr.f_back.f_locals['func'], # decorators
lambda:fr.f_back.f_locals['meth'],
lambda:fr.f_back.f_locals['f'],
):
try:
func = get()
except (KeyError, AttributeError):
pass
else:
if func.__code__ == co:
return func
raise AttributeError("func not found")
def printRandomDoc(textacyCorpus):
import random
print()
printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
############# load xml
def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
"""
generates strings from XML
:param path2xml:
:param main_textfield:
:param cleaning_function:
:yields strings
"""
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
for field in ticket:
if field.tag == main_textfield:
yield field.text
def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
for field in ticket:
if field.tag not in leave_out:
metadata[field.tag] = field.text
yield metadata
############# load csv
def csv_to_contentStream(path2csv: str, content_collumn_name: str):
"""
:param path2csv: string
:param content_collumn_name: string
:return: string-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
content_collumn = 0 # default value
for i,lst in enumerate(stream):
if i == 0:
# look for desired column
for j,col in enumerate(lst):
if col == content_collumn_name:
content_collumn = j
else:
yield lst[content_collumn]
def csv_to_metaStream(path2csv: str, metalist: [str]):
"""
:param path2csv: string
:param metalist: list of strings
:return: dict-generator
"""
stream = textacy.fileio.read_csv(path2csv, delimiter=";") # ,encoding='utf8')
content_collumn = 0 # default value
metaindices = []
metadata_temp = {}
for i,lst in enumerate(stream):
if i == 0:
for j,col in enumerate(lst): # could surely be done more efficiently, but this only runs once
for key in metalist:
if key == col:
metaindices.append(j)
metadata_temp = dict(zip(metalist,metaindices)) # e.g. {'Subject' : 1, 'categoryName' : 3, 'Solution' : 10}
else:
metadata = metadata_temp.copy()
for key,value in metadata.items():
metadata[key] = lst[value]
yield metadata
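A minimal sketch of how the two CSV generators are meant to be paired; the file path and column names are taken from the config and docstrings elsewhere in this commit, the rest is illustrative:
# illustrative: pair each ticket text with its metadata dict, row by row
texts = csv_to_contentStream("M42-Export/Tickets_small.csv", "Description")
metas = csv_to_metaStream("M42-Export/Tickets_small.csv", ["Subject", "categoryName", "Solution"])
for text, meta in zip(texts, metas):
    print(meta["categoryName"], text[:80])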
############################################ Preprocessing ##############################################
############# on str-gen
def processTokens(tokens, funclist, parser):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
# idea: sort funclist so that all string functions run first, then parse once, then work on tokens, then possibly on the whole Doc
if 'bool' in str(f.__annotations__):
tokens = list(filter(f, tokens))
elif 'str' in str(f.__annotations__):
tokens = list(map(f, tokens)) # plain text
doc = parser(" ".join(tokens)) # re-parse
tokens = [tok for tok in doc] # tokens only
elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
#todo feels hacky
doc = parser(" ".join(tok.lower_ for tok in tokens)) # parsed
tokens = f(doc)
doc = parser(" ".join(tokens)) # re-parsed
tokens = [tok for tok in doc] # tokens only
else:
warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))
return tokens
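processTokens dispatches on the return-type annotation attached to each filter factory defined below: bool-annotated functions act as token predicates, str-annotated functions rewrite token text and force a re-parse, and Doc-annotated functions operate on the whole document. A rough usage sketch, assuming DE_PARSER is the module's German spaCy parser; the sample text is illustrative:
funclist = [
    removePOS(["PUNCT", "SPACE"]),   # -> bool : applied via filter()
    replaceEmails(),                 # -> str  : mapped over tokens, then re-parsed
    keepUniqeTokens(),               # -> spacy.tokens.Doc : applied to the whole doc
]
doc = DE_PARSER("Bitte an support@tu-dortmund.de schreiben !")
tokens = processTokens(list(doc), funclist, DE_PARSER)
print(" ".join(tok.lower_ for tok in tokens))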
def processTextstream(textstream, funclist, parser=DE_PARSER):
"""
:param textstream: string-gen
:param funclist: [func]
:param parser: spacy-parser
:return: string-gen
"""
# input:str-stream output:str-stream
pipe = parser.pipe(textstream)
for doc in pipe:
tokens = []
for tok in doc:
tokens.append(tok)
tokens = processTokens(tokens,funclist,parser)
yield " ".join([tok.lower_ for tok in tokens])
def processDictstream(dictstream, funcdict, parser=DE_PARSER):
"""
:param dictstream: dict-gen
:param funcdict:
clean_in_meta = {
"Solution":funclist,
...
}
:param parser: spacy-parser
:return: dict-gen
"""
for dic in dictstream:
result = {}
for key, value in dic.items():
if key in funcdict:
doc = parser(value)
tokens = [tok for tok in doc]
funclist = funcdict[key]
tokens = processTokens(tokens,funclist,parser)
result[key] = " ".join([tok.lower_ for tok in tokens])
else:
result[key] = value
yield result
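A sketch of the metadata-cleaning path, reusing the field/function pairs that the cleaning step configures later in this commit; the XML sample path is the one used elsewhere in the repo, everything else is illustrative:
clean_in_meta = {
    "Solution": [removePOS(["SPACE"])],
    "categoryName": [removePOS(["SPACE", "PUNCT"])],
}
metas = generateMetadatafromTicketXML("ticketSamples.xml")
for meta in processDictstream(metas, clean_in_meta, parser=DE_PARSER):
    print(meta["categoryName"])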
############# return bool
def keepPOS(pos_list) -> bool:
ret = lambda tok : tok.pos_ in pos_list
ret.__annotations__ = get_calling_function().__annotations__
return ret
def removePOS(pos_list)-> bool:
ret = lambda tok : tok.pos_ not in pos_list
ret.__annotations__ = get_calling_function().__annotations__
return ret
def removeWords(words, keep=None)-> bool:
if hasattr(keep, '__iter__'):
for k in keep:
try:
words.remove(k)
except ValueError:
pass
ret = lambda tok : tok.lower_ not in words
ret.__annotations__ = get_calling_function().__annotations__
return ret
def keepENT(ent_list) -> bool:
ret = lambda tok : tok.ent_type_ in ent_list
ret.__annotations__ = get_calling_function().__annotations__
return ret
def removeENT(ent_list) -> bool:
ret = lambda tok: tok.ent_type_ not in ent_list
ret.__annotations__ = get_calling_function().__annotations__
return ret
def remove_words_containing_Numbers() -> bool:
ret = lambda tok: not bool(re.search('\d', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
def remove_words_containing_specialCharacters() -> bool:
ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
def remove_words_containing_topLVL() -> bool:
ret = lambda tok: not bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', tok.lower_))
ret.__annotations__ = get_calling_function().__annotations__
return ret
def lemmatizeWord(word,filepath=LEMMAS):
"""http://www.lexiconista.com/datasets/lemmatization/"""
for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
if word.lower() == line.split()[1].strip().lower():
return line.split()[0].strip().lower()
return word.lower() # if nothing was found
def lemmatize() -> str:
ret = lambda tok: lemmatizeWord(tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
############# return strings
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)
def replaceEmails(replace_with="EMAIL") -> str:
ret = lambda tok : emailFinder.sub(replace_with, tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def replaceURLs(replace_with="URL") -> str:
ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_,replace_with=replace_with)
#ret = lambda tok: urlFinder.sub(replace_with,tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def replaceSpecialChars(replace_with=" ") -> str:
ret = lambda tok: specialFinder.sub(replace_with,tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
ret = lambda tok : mentionFinder.sub(replace_with,tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def replaceNumbers(replace_with="NUMBER") -> str:
ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def replaceHardS(replace_with="ss") -> str:
ret = lambda tok: hardSFinder.sub(replace_with,tok.lower_)
ret.__annotations__ = get_calling_function().__annotations__
return ret
def fixUnicode() -> str:
ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')
ret.__annotations__ = get_calling_function().__annotations__
return ret
def resolveAbbreviations():
pass #todo
#todo remove words with len < 2 (but resolve abbreviations first, esp. tu and fh) and with len > 35 or 50 ("Reiserücktrittskostenversicherung")
############# return docs
def keepUniqeTokens() -> spacy.tokens.Doc:
ret = lambda doc: (set([tok.lower_ for tok in doc]))
ret.__annotations__ = get_calling_function().__annotations__
return ret
def lower() -> spacy.tokens.Doc:
ret = lambda doc: ([tok.lower_ for tok in doc])
ret.__annotations__ = get_calling_function().__annotations__
return ret
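Putting the pieces together: the factories above are meant to be collected into one list and pushed through processTextstream. A rough end-to-end sketch, assuming DE_PARSER and LEMMAS are initialised as in the full module and using the sample CSV named in config.ini:
filterlist = [
    removePOS(["PUNCT", "SPACE", "NUM"]),
    remove_words_containing_Numbers(),
    replaceEmails(),
    lemmatize(),
    keepUniqeTokens(),
]
texts = csv_to_contentStream("M42-Export/Tickets_small.csv", "Description")
for cleaned in processTextstream(texts, filterlist, parser=DE_PARSER):
    print(cleaned)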
################################################################################################################


@@ -11,6 +11,9 @@ from scipy import *
import os
from preprocessing import removePOS
from preprocessing import filterTokens
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
@@ -24,11 +27,6 @@ with open(config_ini) as f:
config.read_file(f)
global REGEX_SPECIALCHAR
global WORDS
REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'
WORDS= {}
@@ -113,15 +111,12 @@ def clean(stringstream,autocorrect=False):
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# question: autocorrect?
# idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
# question: autocorrect? idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
if autocorrect:
string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
def processDictstream(dictstream, funcdict, parser):
"""
@@ -154,30 +149,21 @@ def processDictstream(dictstream, funcdict, parser):
result[key] = value
yield result
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
return tokens
def removePOS(pos_list):
return lambda tok: tok.pos_ not in pos_list
##################################################################################################
ressources_path = FILEPATH + "ressources/"
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
autocorrect = config.getboolean("preprocessing", "autocorrect")
def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10,autocorrect=False):
logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))
@@ -192,7 +178,7 @@ def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrando
## process and add files to textacy-corpi,
clean_corpus.add_texts(
clean(corpus2Text(raw_corpus)),
clean(corpus2Text(raw_corpus),autocorrect=autocorrect),
processDictstream(corpus2Meta(raw_corpus), clean_in_meta,parser=parser)
)
@@ -220,8 +206,6 @@ def main():
WORDS = load_obj(path2wordsdict)
clean_in_content = [] # question: is this necessary?
clean_in_meta = {
"Solution": [removePOS(["SPACE"])],
@@ -229,7 +213,7 @@ def main():
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de",printrandom=5 )
corpus = cleanCorpus(corpus_de_path, clean_in_meta, "de",printrandom=5, autocorrect=autocorrect )
end = time.time()
logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))


@@ -1,24 +0,0 @@
Index: 0
Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax.
categoryName: betrieb
Index: 0
Text: support browser service portal mittlerweile
categoryName: betrieb
Index: 1
Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden.
categoryName: elektronisches telefonbuch
Index: 1
Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung
categoryName: elektronisches telefonbuch


@@ -37,12 +37,12 @@ pickle_file=en_stopwords_list.pkl
[logging]
level=INFO
filename=topicModelTickets.log
filename=log/topicModelTickets.log
[de_corpus]
input=M42-Export/Tickets_small.csv
#input=M42-Export/de_tickets.csv
#input=M42-Export/Tickets_small.csv
input=M42-Export/de_tickets.csv
path=corpi/
@@ -64,7 +64,10 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI
#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC
custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
autocorrect = false
#true
custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember


@@ -23,8 +23,6 @@ with open(config_ini) as f:
def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
"""
:param path2csv: string
@@ -75,27 +73,9 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
##################################################################################################
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"
"""
content_collumn_name = "Description"
metaliste = [
"TicketNumber",
"Subject",
"CreatedDate",
"categoryName",
"Impact",
"Urgency",
"BenutzerID",
"VerantwortlicherID",
"EigentuemerID",
"Solution"
]
"""
content_collumn_name = config.get("tickets","content_collumn_name")
metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
metaliste = get_list_from_config("tickets","metaliste")
path2de_csv = FILEPATH + config.get("de_corpus","input")
@@ -110,7 +90,6 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):
# print paths
path_csv_split = path2_csv.split("/")
filename = path_csv_split[len(path_csv_split) - 1]
@@ -121,8 +100,6 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
raw_corpus = textacy.Corpus(lang)
## add files to textacy-corpi,
#printlog("Add texts to {0}_textacy-corpi".format(lang))
raw_corpus.add_texts(
ticketcsv_to_textStream(path2_csv, content_collumn_name),
ticket_csv_to_DictStream(path2_csv, metaliste)
@@ -132,6 +109,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
# drop empty docs from the corpus
raw_corpus.remove(lambda doc: len(doc) == 0)
logprint("corpus-length: {}".format(len(raw_corpus)))
# print random docs
for i in range(printrandom):
printRandomDoc(raw_corpus)

File diff suppressed because it is too large.

File diff suppressed because it is too large.

init.py

@@ -237,36 +237,37 @@ def build_words_for_spellchecking(path2words):
##################################################################################################
# THESAURUS
path2wordnet = FILEPATH + config.get("thesaurus","input")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
ressources_path = FILEPATH + "ressources/"
path2wordnet = ressources_path + config.get("thesaurus","input")
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
# SPELLCHECKING
path2words_file = FILEPATH + config.get("spellchecking","input")
path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
path2words_file = ressources_path + config.get("spellchecking","input")
path2wordlist = ressources_path + config.get("spellchecking","pickle_file")
# LEMMA
path2lemma_file = FILEPATH + config.get("lemmatization","input")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
path2lemma_file = ressources_path + config.get("lemmatization","input")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
# NOMEN
nouns1 = FILEPATH + config.get("nouns","input1")
nouns2 = FILEPATH + config.get("nouns","input2")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
nouns1 = ressources_path + config.get("nouns","input1")
nouns2 = ressources_path + config.get("nouns","input2")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
# VORNAMEN
firstnames_txt = FILEPATH + config.get("firstnames","input")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
firstnames_txt = ressources_path + config.get("firstnames","input")
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
# STOPWORDS
stop1 = FILEPATH + config.get("de_stopwords","input1")
stop2 = FILEPATH + config.get("de_stopwords","input2")
stop3 = FILEPATH + config.get("de_stopwords","input3")
path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file")
stop1 = ressources_path + config.get("de_stopwords","input1")
stop2 = ressources_path + config.get("de_stopwords","input2")
stop3 = ressources_path + config.get("de_stopwords","input3")
path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")
path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")

File diff suppressed because it is too large.

File diff suppressed because it is too large.

main.py

@@ -11,12 +11,12 @@ import cleaning
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()
#init.main()
init.main()
logprint("")
corporization.main()
@@ -30,32 +30,23 @@ logprint("")
"""
topicModeling.main(use_raw=False,algorithm="lsa")
#topicModeling.main(use_cleaned=False,algorithm="lsa")
logprint("")
topicModeling.main(use_raw=False,algorithm="lda")
#topicModeling.main(use_cleaned=False,algorithm="nmf")
logprint("")
topicModeling.main(use_raw=False,algorithm="nmf")
#topicModeling.main(use_cleaned=False,algorithm="lda")
logprint("")
topicModeling.main(use_raw=False,algorithm="llda")
topicModeling.main(use_cleaned=False,algorithm="llda")
logprint("")
"""
logprint("")
end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))


@@ -153,6 +153,25 @@ def printRandomDoc(textacyCorpus):
print()
def get_list_from_config(section,option):
return list(map(textacy.preprocess.normalize_whitespace,config.get(section,option).split(",")))
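get_list_from_config replaces the inline normalize_whitespace/split pattern used in corporization.py above; for example, with the metaliste entry from config.ini:
metaliste = get_list_from_config("tickets", "metaliste")
# -> ['TicketNumber', 'Subject', 'CreatedDate', 'categoryName', ...]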
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
def savelabledCorpiLines(corpus,filepath):
textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)
def gen_labledLines(corpus):
for doc in corpus:
# generate lines of the form "[topic1, topic2, ...] tok1 tok2 tok3" out of the corpus
yield "[" + doc.metadata["categoryName"] + "] " + doc.text
def save_corpus(corpus, corpus_path, corpus_name):
@@ -219,95 +238,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
for key,value in plain.items():
if key != "content" and key != "index":
meta[key] = value
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
"""
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
def saveplaincorpustext(corpus,path):
textacy.fileio.write_file_lines(corpus2Text(corpus),filepath=path )
def save_corpusV2(corpus, corpus_path, corpus_name):
# save parser
parser = corpus.spacy_lang
parserpath = corpus_path + str(parser.lang) + '_parser'
parser.save_to_directory(parserpath)
contentpath = corpus_path +corpus_name + "_docs/"
if not os.path.exists(contentpath):
os.makedirs(contentpath)
for doc in corpus:
with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'w') as f:
f.write(doc.spacy_doc.to_bytes())
with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file:
file.write(json.dumps(doc.metadata))
def load_corpusV2(corpus_path, corpus_name, lang="de"):
# check for language
if "de_" in corpus_name:
lang = "de"
elif "en_" in corpus_name:
lang = "en"
# load parser
parser = spacy.load(lang)
stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
parser.vocab.load_lexemes(vocabpath)
# load corpus
corpus = textacy.Corpus(parser)
contentpath = corpus_path + corpus_name + "_docs/"
docs = yield_fromdir(contentpath,spacy_vocab=corpus.spacy_vocab,type="doc")
metas = yield_fromdir(contentpath,type="meta")
for doc,meta in zip(docs,metas):
corpus.add_doc(
textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang
def yield_fromdir(path,spacy_vocab=None,type=".pkl"):
os.chdir(path)
filelist = [name for name in os.listdir('.') if os.path.isfile(name)]
filelist = [filename for filename in filelist if type in filename]
filelist.sort(key = lambda elem : elem.split("_")[0])
if type =='doc':
for filename in filelist:
with open(path+filename,'r') as f:
for bytes_string in SpacyDoc.read_bytes(f):
yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
elif type == 'meta':
for filename in filelist:
with open(path+filename,'r') as f:
yield json.load(f)
else:
for filename in filelist:
yield load_obj(path+filename)
"""
corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))
return corpus, corpus.spacy_lang


@@ -1,466 +0,0 @@
# -*- coding: utf-8 -*-
import csv
import random
import sys
import spacy
import textacy
"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)
"""
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python
# iterate over the thesaurus
for syn_block in thesaurus_gen: # syn_block is a list of synonyms
# iterate over the synonym block
for syn in syn_block:
syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn # turn the synonym into a list (to detect multi-word phrases)
# if the word is contained in the synonym (i.e. equals one word of the list)
if word in syn:
# look for the main form ("Hauptform")
if "auptform" in syn:
# do not return it if it is in parentheses
for w in syn:
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
# if no main form is present, return the first synonym that is not a phrase and not in parentheses
if len(syn) == 1:
w = syn[0]
if not re.match(r'\([^)]+\)', w) and w is not None:
return w
return word # as a fallback, return the input
"""
"""
def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# List of symbols we don't care about either
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# get rid of newlines
string = string.strip().replace("\n", " ").replace("\r", " ")
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = PARSER(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #IDEE NUM mit in den Corpus aufnehmen, aber fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove symbols
tokens = [tok for tok in tokens if tok not in symbols]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
remove_large_strings_of_whitespace(" ".join(tokens))
# idea: resolve abbreviations (especially TU -> Technische Universität)
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(tokens)
def remove_large_strings_of_whitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
tokenlist = sentence.split(" ")
while "" in tokenlist:
tokenlist.remove("")
while " " in tokenlist:
tokenlist.remove(" ")
return " ".join(tokenlist)
"""
"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False,lemmatize=False):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
text = "ERROR"
for field in ticket:
if field.tag == textfield:
if clean:
text = cleanText_words(field.text,PARSER,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
else:
text = field.text
else:
# idea: clean here as well?
metadata[field.tag] = field.text
yield text, metadata
"""
LANGUAGE = 'de'
#PARSER = de_core_news_md.load()
PARSER = spacy.load(LANGUAGE)
from old.textCleaning import TextCleaner
cleaner = TextCleaner(parser=PARSER)
def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
text = "ERROR"
for field in ticket:
if field.tag == textfield:
if clean:
text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text))) #,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
else:
text = field.text
yield text
def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):#,keys_to_clean=["Loesung","Zusammenfassung"]):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
for field in ticket:
if field.tag != textfield:
if field.tag == "Zusammenfassung":
metadata[field.tag] = cleaner.removePunctuation(field.text)
elif field.tag == "Loesung":
metadata[field.tag] = cleaner.removeWhitespace(field.text)
else:
metadata[field.tag] = field.text
yield metadata
"""
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):
if custom_symbols is not None:
custom_symbols = custom_symbols
else:
custom_symbols = []
if keep is not None:
keep = keep
else:
keep = []
# List of symbols we don't care about
symbols = ["-----","---","...","","",".","-","<",">",",","?","!","..","nt","n't","|","||",";",":","","s","'s",".","(",")","[","]","#"] + custom_symbols
# parse with spaCy
spacy_doc = parser(string)
tokens = []
pos = ["NUM", "SPACE", "PUNCT"]
for p in keep:
pos.remove(p)
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ not in pos and tok.text not in symbols:
tokens.append(tok.text)
return " ".join(tokens)
def cleanText_words(string,parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):
# use preprocessing
if customPreprocessing is not None:
string = customPreprocessing(string)
if custom_stopwords is not None:
custom_stopwords = custom_stopwords
else:
custom_stopwords = []
if custom_words is not None:
custom_words = custom_words
else:
custom_words = []
# custom stoplist
# https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS
stoplist =list(stop_words) + custom_stopwords
# replace twitter
mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
string = mentionFinder.sub("MENTION", string)
# replace emails
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
string = emailFinder.sub("EMAIL", string)
# replace urls
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
string = urlFinder.sub("URL", string)
# replace HTML symbols
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = parser(string)
tokens = []
added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] #fürs TopicModeling nur Nomen http://aclweb.org/anthology/U15-1013
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in added_POS:
if lemmatize:
tokens.append(tok.lemma_.lower().strip())
else:
tokens.append(tok.text.lower().strip())
# add entities
if tok.ent_type_ in added_entities:
tokens.append(tok.text.lower())
# remove stopwords
tokens = [tok for tok in tokens if tok not in stoplist]
# remove custom_words
tokens = [tok for tok in tokens if tok not in custom_words]
# remove single characters
tokens = [tok for tok in tokens if len(tok)>1]
# remove large strings of whitespace
#remove_whitespace(" ".join(tokens))
# idea: resolve abbreviations (especially TU -> Technische Universität): abbreviation glossary
if normalize_synonyms:
tokens = [str(getFirstSynonym(tok,THESAURUS_list)) for tok in tokens]
return " ".join(set(tokens))
def cleanText_removeWhitespace(sentence):
whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
sentence = whitespaceFinder.sub(" ", sentence)
return sentence
#todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms
def getFirstSynonym(word, thesaurus_gen):
word = word.lower()
# iterate over the thesaurus
for syn_block in thesaurus_gen: # syn_block is a list of synonyms
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
if word == syn:
return getHauptform(syn_block, word)
else: # if it is a phrase
if word in syn:
return getHauptform(syn_block, word)
return word # as a fallback, return the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# do not return it if it is in parentheses
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# if no main form ("Hauptform") is present, return the first synonym that is not a phrase and not in parentheses
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # as a fallback, return the original word
"""
def printRandomDoc(textacyCorpus):
print()
print("len(textacyCorpus) = %i" % len(textacyCorpus))
randIndex = int((len(textacyCorpus) - 1) * random.random())
print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
print()
####################'####################'####################'####################'####################'##############
# todo config-file
DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"
normalize_Synonyms = True
clean = True
lemmatize = True
custom_words = ["grüßen", "fragen"]
####################'####################'####################'####################'####################'##############
## files to textacy-corpi
textacyCorpus = textacy.Corpus(PARSER)
print("add texts to textacy-corpi...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))
#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
# textacyCorpus.add_text(txt,dic)
for doc in textacyCorpus:
print(doc.metadata)
print(doc.text)
#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)
print()
print()


@@ -1,213 +0,0 @@
# -*- coding: utf-8 -*-
import spacy
import textacy
from spacy.tokens import Doc
# -*- coding: utf-8 -*-
import re
import spacy
import functools
import textacy
class TextCleaner:
def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
"""
:param parser: spacy-parser
:param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
:param customClass_symbols:[str]
:param customClass_words:[str]
:param customClassPOS:[str]
:param keep4All: [str]
"""
if thesaurus is None:
DATAPATH_thesaurus = "openthesaurus.csv"
## !!!!!! the list() is important, otherwise the same synonyms are not returned, because a generator is consumed during runtime
self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
else:
self.thesaurus = thesaurus
self.parser = parser
#self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
# to keep
self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] # only nouns for topic modeling http://aclweb.org/anthology/U15-1013
"""
# to remove
self.symbols = ["-----", "---", "...", "", "", ".", "-", "<", ">", ",", "?", "!", "..", "nt", "n't", "|", "||",
";", ":",
"", "s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
# modify those to remove with those to keep
for sym in keep:
try:
self.symbols.remove(sym)
except ValueError:
pass
for sym in keep:
try:
self.stop_words.remove(sym)
except ValueError:
pass
"""
def loadString(self,string):
self.currentDoc = self.parser(string)
def removeWhitespace(self, string):
return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])
def removePunctuation(self, string, custom_symbols=None, keep=None):
symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
if hasattr(keep, '__iter__'):
for k in keep:
try:
symbols.remove(k)
except ValueError:
pass
return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])
def cleanDoc(doc, toDelete=None, toKeep=None):
"""
:param doc: spacyDoc
:param toDelete: [str] pos_ , ent_type_ or tag_
:return: str tokenlist
"""
#keep
tokenlist = []
for tok in doc:
if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
tokenlist.append(tok.text)
#delete
tokenlist = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete]
result = " ".join(tokenlist)
return result #problem: kein doc und daher nicht komponierbar
def keepinDoc(doc, toKeep=None):
"""
:param doc: spacyDoc
:param toDelete: [str]
:return: str tokenlist
"""
return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])
# https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')
cleaner = TextCleaner(parser)
corpus_raw = textacy.Corpus(parser)
corpus_clean = textacy.Corpus(parser)
def foo(doc, toKeep=None):
words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
spaces = [True] * len(words)
return Doc(doc.vocab,words=words,spaces=spaces)
def foo2(doc, toDelete=None):#, toKeep=None):
"""
:param doc: spacyDoc
:param toDelete: [str] pos_ , ent_type_ or tag_
:return: str tokenlist
"""
#keep
#tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
#delete
words = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete]
spaces = [True] * len(words)
return Doc(doc.vocab, words=words, spaces=spaces)
"""
def compose(self,*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
def composeo(*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
"""
def double(a):
return a*2
def add(a, b):
return a+b
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
return functools.reduce(compose2, functions, lambda x: x)
#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
"""
def pipe1(string):
cleaner.loadString(string)
string = cleaner.removeWhitespace(string)
string = cleaner.removePunctuation(string)
return string
"""
def cleaningPipe(spacy_pipe, composition):
for doc in spacy_pipe:
yield composition(doc)
pipeline = compose(
functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
functools.partial(foo, toKeep=["NOUN"]))
string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
doc = parser(string)
#print(removeFromDoc(doc,toDelete=["PUNCT"]))
print(pipeline(doc.text))
for txt in cleaningPipe(parser.pipe([string]),pipeline):
print(txt)
"""
corpus_raw.add_text(string)
for doc in parser.pipe([string]):
doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
"""
#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
#print(corpus_raw[0].text)


@@ -1,199 +0,0 @@
# -*- coding: utf-8 -*-
import functools
import re
import spacy
import textacy
from spacy.tokens import Doc
from spacy.tagger import Tagger
import xml.etree.ElementTree as ET
PARSER = spacy.load('de')
stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
def compose(*functions):
def compose2(f, g):
return lambda x: f(g(x))
return functools.reduce(compose2, functions, lambda x: x)
def cleanTexts(textstream, parser, attr):
#input str-stream output str-stream
pipe = parser.pipe(textstream)
for doc in pipe:
tokens = [tok.text for tok in doc
if tok.pos_ not in attr
and tok.tag_ not in attr
and tok.ent_ not in attr
and tok.text not in attr
and tok.lower_ not in attr]
yield " ".join(tokens)
"""
def cleanDoc_lemmatize(doc,parser=PARSER):
return parser(" ".join([tok.lemma_ for tok in doc ]))
def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
if stop_words is None:
stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
if hasattr(keep, '__iter__'):
for k in keep:
try:
stop_words.remove(k)
except ValueError:
pass
return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
if keeponly:
return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
else:
return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
if keeponly:
return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
else:
return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
"""
def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
"""
:param spacypipe: spacypipe
:param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted
:param attr: [str] pos_ or ent_type_
:yields: stream of strings: full-length cleaned text
"""
if keeponly:
for doc in spacypipe:
yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
else:
for doc in spacypipe:
yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])
def cleanText_POS(text,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
"""
:param txt: str
:param keeponly: bool . If True, only attr will be kept. If false, all attr will be deleted
:param attr: [str] pos_ or ent_type_
:return: str
"""
doc = parser(text)
if keeponly:
return " ".join([tok.text for tok in doc if tok.pos_ in attr])
else:
return " ".join([tok.text for tok in doc if tok.pos_ not in attr])
def removeWhitespace(string):
return re.sub(r'(\r\n|\r|\n|(\s)+)', ' ', string)
def removeWords(string, words):
big_regex = re.compile('|'.join(map(re.escape, words)))
return big_regex.sub("", string)
def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
"""
generates strings from XML
:param path2xml:
:param main_textfield:
:param cleaning_function:
:yields strings
"""
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
text = "ERROR"
for field in ticket:
if field.tag == main_textfield:
if cleaning_function:
text = cleaning_function(field.text)
else:
text = field.text
yield text
def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
import xml.etree.ElementTree as ET
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for ticket in root:
metadata = {}
for field in ticket:
if field.tag not in leave_out:
if field.tag in key_function_pairs_to_clean:
metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
else:
metadata[field.tag] = field.text
yield metadata
string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
#print(removeWords(string,["die", "neue"]))
# in:str out:str
cleanString = compose(
cleanText_POS,
functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
)
key_function_pairs_to_clean = {
"Loesung":removeWhitespace,
"Zusammenfassung":cleanText_POS
}
"""
# in:str-gen out:str-gen
cleanStream = compose(
removeSTOP,
lemmatize,
cleanEnt
)
"""
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi
# metadata:xml -> -> stringCleaning -> corpi
corpus = textacy.Corpus(PARSER)
corpus.add_texts(
cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,
#generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)
)
print(corpus[0].text)


@@ -1,263 +0,0 @@
# -*- coding: utf-8 -*-
import re
import spacy
import functools
import textacy
class TextCleaner:
def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
"""
:param parser: spacy-parser
:param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
:param customClass_symbols:[str]
:param customClass_words:[str]
:param customClassPOS:[str]
:param keep4All: [str]
"""
if thesaurus is None:
DATAPATH_thesaurus = "openthesaurus.csv"
## !!!!!! the list() is important, otherwise the same synonyms are not returned, because a generator is consumed during runtime
self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
else:
self.thesaurus = thesaurus
self.parser = parser
self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
# to remove
self.symbols = ["-----", "---", "...", "", "", ".", "-", "<", ">", ",", "?", "!", "..", "nt", "n't", "|", "||",
";", ":",
"", "s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
# to keep
self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] # only nouns for topic modeling http://aclweb.org/anthology/U15-1013
self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
# modify those to remove with those to keep
for sym in keep:
try:
self.symbols.remove(sym)
except ValueError:
pass
for sym in keep:
try:
self.stop_words.remove(sym)
except ValueError:
pass
# idea: self.currentDoc = spacy.Doc per string, but not per method
def loadString(self,string):
self.currentDoc = self.parser(string)
"""
def removeWhitespace(self, string):
string = self.whitespaceFinder.sub(" ", string)
return string
"""
def removeWhitespace(self, string):
return string
#self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
def removePunctuation(self, string, custom_symbols=None, keep=None):
symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
if hasattr(keep, '__iter__'):
for k in keep:
try:
symbols.remove(k)
except ValueError:
pass
# parse with spaCy
doc = self.parser(string)
tokens = []
# append Tokens to a list
for tok in doc:
if not tok.is_punct and not tok.is_space and tok.text not in symbols:
tokens.append(tok.text)
return " ".join(tokens)
def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None):
pos2keep = self.pos2keep + (customPOS if customPOS is not None else [])
ent = self.entities2keep + (customEnt if customEnt is not None else [])
if hasattr(remove, '__iter__'):
for k in remove:
try:
ent.remove(k)
except ValueError:
try:
pos2keep.remove(k)
except ValueError:
pass
# parse with spaCy
spacy_doc = self.parser(string)
tokens = []
# append Tokens to a list
for tok in spacy_doc:
if tok.pos_ in pos2keep:
tokens.append(tok.text)
if tok.ent_type_ in ent:
tokens.append(tok.text)
return " ".join(set(tokens))
def resolveAbbreviations(self,string):
return string #todo
def removeWords(self,string, custom_words=None, keep=None, lemmatize=False):
wordlist = self.stop_words + (custom_words if custom_words is not None else [])
if hasattr(keep, '__iter__'):
for k in keep:
try:
wordlist.remove(k)
except ValueError:
pass
string = self.urlFinder.sub("URL", string)
string = self.emailFinder.sub("EMAIL", string)
string = self.mentionFinder.sub("MENTION", string)
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
# parse with spaCy
spacy_doc = self.parser(string)
tokens = []
# append Tokens to a list
for tok in spacy_doc:
#do not include stopwords/customwords and single chars
if tok.text not in wordlist and len(tok)>1:
if lemmatize:
tokens.append(tok.lemma_)
else:
tokens.append(tok.lower_)
return " ".join(set(tokens))
def normalizeSynonyms(self, string, default_return_first_Syn=False):
# parse with spaCy
spacy_doc = self.parser(string)
tokens = []
tokens = [str(self.getFirstSynonym(tok, self.thesaurus, default_return_first_Syn=default_return_first_Syn)) for tok in spacy_doc]
return " ".join(set(tokens))
def getFirstSynonym(self,word, thesaurus, default_return_first_Syn=False):
if not isinstance(word, str):
return word
word = word.lower()
# iterate over the thesaurus
for syn_block in thesaurus: # syn_block is a list of synonyms
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
if word == syn:
return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
else: # if it is a phrase
if word in syn:
return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
return word # as a fallback, return the original word
def getHauptform(self,syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# do not return it if it is in parentheses
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# if no main form ("Hauptform") is present, return the first synonym that is not a phrase and not in parentheses
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # as a fallback, return the original word
"""
#################################################################################################################
#todo somehow does not work as it should: https://mathieularose.com/function-composition-in-python/
def compose(self,*functions):
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms)
#################################################################################################################
"""

File diff suppressed because it is too large.


@@ -24,16 +24,7 @@ with open(config_ini) as f:
config.read_file(f)
global REGEX_SPECIALCHAR
global REGEX_TOPLVL
global THESAURUS
global WORDS
global LEMMAS
global NOUNS
global VORNAMEN
global DE_STOP_WORDS
global EN_STOP_WORDS
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
@@ -47,8 +38,20 @@ VORNAMEN= {}
DE_STOP_WORDS= {}
EN_STOP_WORDS= {}
############# filter tokens
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
return tokens
def keepPOS(pos_list):
return lambda tok: tok.pos_ in pos_list
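filterTokens simply applies each predicate in turn via filter(); combined with keepPOS it yields, for example, the noun-only view used for topic modeling. A small sketch (the parser setup mirrors the other scripts in this repo; the sample sentence is the test string used there):
import spacy
parser = spacy.load('de')
doc = parser("Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge.")
nouns = filterTokens([tok for tok in doc], [keepPOS(["NOUN"])])
print([tok.text for tok in nouns])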
@@ -107,7 +110,7 @@ def remove_first_names():
############# strings
def remove_addresses(string):
pass # todo
pass # todo remove_addresses; idea: use postal.parser and add them to the metadata
def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
for i in range(n):
@@ -183,55 +186,9 @@ def autocorrectWord(word):
############# stringcleaning
@deprecated
def stringcleaning(stringstream):
for string in stringstream:
string = string.lower()
# fixUnicode
string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
# remove_words_containing_topLVL
string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# seperate_words_on_regex:
string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
# cut_after
word = "gruss" #idee addressen enfernen --> postal.parser
string = string.rpartition(word)[0] if word in string else string
# lemmatize
string = " ".join([lemmatizeWord(word) for word in string.split()])
# synonyme normalisieren #idee vor oder nach lemmatize?
string = " ".join([getFirstSynonym(word) for word in string.split()])
# autocorrect
string = " ".join([autocorrectWord(word) for word in string.split()])
yield string
def filterTokens(tokens, funclist):
# in:tokenlist, funclist
# out: tokenlist
for f in funclist:
tokens = list(filter(f, tokens))
return tokens
def processContentstream2(textstream, parser, token_filterlist=None):
def processContentstream(textstream, parser, token_filterlist=None):
#pre parse
textstream = preparse(textstream)
@ -247,7 +204,7 @@ def processContentstream2(textstream, parser, token_filterlist=None):
tokens = filterTokens(tokens, token_filterlist)
# post parse
tokens = [postparse(tok) for tok in tokens] #todo loss of pos, tag etc. information!
tokens = [postparse(tok) for tok in tokens] #todo: loss of pos, tag etc. information!
yield " ".join(tokens)
@ -256,7 +213,6 @@ def preparse(stringstream):
for string in stringstream:
# cut_after
# todo: remove addresses --> postal.parser; idea: add them to the metadata
words = ["gruss", "grusse","gruesse","gruessen","grusses"]
for gr in words:
@ -287,39 +243,6 @@ def postparse(toktext):
return toktext
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
def corpus2Meta(corpus):
for doc in corpus:
yield doc.metadata
@deprecated
def processContentstream(textstream, parser, token_filterlist=None):
"""
:param textstream: string-gen
:param funclist: [func]
:param parser: spacy-parser
:return: string-gen
"""
# pre_parse
textstream = stringcleaning(textstream)
pipe = parser.pipe(textstream)
tokens = []
for doc in pipe:
tokens = [tok for tok in doc]
# in_parse
if token_filterlist is not None:
tokens = filterTokens(tokens, token_filterlist)
yield " ".join([tok.lower_ for tok in tokens])
# yield " ".join(list(set([tok.lower_ for tok in tokens])))
def processDictstream(dictstream, funcdict, parser):
"""
@ -356,30 +279,30 @@ def processDictstream(dictstream, funcdict, parser):
##################################################################################################
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
ressources_path = FILEPATH + "ressources/"
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
custom_words = get_list_from_config("preprocessing", "custom_words")
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file")
path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file")
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now()))
@ -387,8 +310,8 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
cleanCorpus_name = lang + "_clean_ticket"
preCorpus_name = lang + "_pre_ticket"
logprint("Load {0}_raw".format(lang))
#load raw corpus and create new one
logprint("Load {0}_raw".format(lang))
clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path)
corpus = textacy.Corpus(parser)
@ -396,7 +319,7 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
## process and add files to textacy-corpi,
corpus.add_texts(
processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser)
)
@ -409,22 +332,16 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
printRandomDoc(corpus)
#save corpus
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
#save corpus as labeled, plain text
plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"
textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath )
savelabledCorpiLines(corpus, de_plainpath)
return corpus
def labledCorpiLines(corpus):
for doc in corpus:
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
yield "[" + doc.metadata["categoryName"] + "] " + doc.text
def main():
start = time.time()
@ -438,10 +355,8 @@ def main():
NOUNS = load_obj(path2nouns_list)
VORNAMEN = load_obj(path2firstnameslist)
custom_words = config.get("preprocessing","custom_words").split(",")
filter_tokens = [
# removeENT(["PERSON"]),
keepNouns(NOUNS),
@ -465,13 +380,10 @@ def main():
"categoryName": [removePOS(["SPACE", "PUNCT"])]
}
corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5)
#from topicModeling import jgibbsLLDA
#jgibbsLLDA(corpus)
#preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
end = time.time()
logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))


@ -1,58 +0,0 @@
# -*- coding: utf-8 -*-
# https://github.com/norvig/pytudes/blob/master/spell.py
"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
Copyright (c) 2007-2016 Peter Norvig
MIT license: www.opensource.org/licenses/mit-license.php
"""
################ Spelling Corrector
import re
from collections import Counter
import spacy
import textacy
def words(text): return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open('bigo.txt').read()))
x=0
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
return WORDS[word] / N
def correction(word):
"Most probable spelling correction for word."
return max(candidates(word), key=P)
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
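# Hypothetical usage, assuming WORDS was built from a sufficiently large German
# word-frequency corpus:
print(correction("drucekr"))          # would typically return "drucker" if that word is frequent enough
print(known(["telefon", "xyzabc"]))   # -> {'telefon'} when only "telefon" occurs in WORDS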


@ -1,622 +0,0 @@
a
ab
aber
ach
acht
achte
trotz
achten
achter
achtes
ag
alle
allein
allem
allen
aller
allerdings
alles
allgemeinen
als
also
am
an
ander
andere
anderem
anderen
anderer
anderes
anderm
andern
anderr
anders
au
auch
auf
aus
ausser
ausserdem
außer
außerdem
b
bald
bei
beide
beiden
beim
beispiel
bekannt
bereits
besonders
besser
besten
bin
bis
bisher
bist
c
d
d.h
da
dabei
dadurch
dafür
dagegen
daher
dahin
dahinter
damals
damit
danach
daneben
dank
dann
daran
darauf
daraus
darf
darfst
darin
darum
darunter
darüber
das
dasein
daselbst
dass
dasselbe
davon
davor
dazu
dazwischen
daß
dein
deine
deinem
deinen
deiner
deines
dem
dementsprechend
demgegenüber
demgemäss
demgemäß
demselben
demzufolge
den
denen
denn
denselben
der
deren
derer
derjenige
derjenigen
dermassen
dermaßen
derselbe
derselben
des
deshalb
desselben
dessen
deswegen
dich
die
diejenige
diejenigen
dies
diese
dieselbe
dieselben
diesem
diesen
dieser
dieses
dir
doch
dort
drei
drin
dritte
dritten
dritter
drittes
du
durch
durchaus
durfte
durften
dürfen
dürft
e
eben
ebenso
ehrlich
ei
ei,
eigen
eigene
eigenen
eigener
eigenes
ein
einander
eine
einem
einen
einer
eines
einig
einige
einigem
einigen
einiger
einiges
einmal
eins
elf
en
ende
endlich
entweder
er
ernst
erst
erste
ersten
erster
erstes
es
etwa
etwas
euch
euer
eure
eurem
euren
eurer
eures
f
folgende
früher
fünf
fünfte
fünften
fünfter
fünftes
für
g
gab
ganz
ganze
ganzen
ganzer
ganzes
gar
gedurft
gegen
gegenüber
gehabt
gehen
geht
gekannt
gekonnt
gemacht
gemocht
gemusst
genug
gerade
gern
gesagt
geschweige
gewesen
gewollt
geworden
gibt
ging
gleich
gott
gross
grosse
grossen
grosser
grosses
groß
große
großen
großer
großes
gut
gute
guter
gutes
h
hab
habe
haben
habt
hast
hat
hatte
hatten
hattest
hattet
heisst
her
heute
hier
hin
hinter
hoch
hätte
hätten
i
ich
ihm
ihn
ihnen
ihr
ihre
ihrem
ihren
ihrer
ihres
im
immer
in
indem
infolgedessen
ins
irgend
ist
j
ja
jahr
jahre
jahren
je
jede
jedem
jeden
jeder
jedermann
jedermanns
jedes
jedoch
jemand
jemandem
jemanden
jene
jenem
jenen
jener
jenes
jetzt
k
kam
kann
kannst
kaum
kein
keine
keinem
keinen
keiner
keines
kleine
kleinen
kleiner
kleines
kommen
kommt
konnte
konnten
kurz
können
könnt
könnte
l
lang
lange
leicht
leide
lieber
los
m
machen
macht
machte
mag
magst
mahn
mal
man
manche
manchem
manchen
mancher
manches
mann
mehr
mein
meine
meinem
meinen
meiner
meines
mensch
menschen
mich
mir
mit
mittel
mochte
mochten
morgen
muss
musst
musste
mussten
muß
mußt
möchte
mögen
möglich
mögt
müssen
müsst
müßt
n
na
nach
nachdem
nahm
natürlich
neben
nein
neue
neuen
neun
neunte
neunten
neunter
neuntes
nicht
nichts
nie
niemand
niemandem
niemanden
noch
nun
nur
o
ob
oben
oder
offen
oft
ohne
ordnung
p
q
r
recht
rechte
rechten
rechter
rechtes
richtig
rund
s
sa
sache
sagt
sagte
sah
satt
schlecht
schluss
schon
sechs
sechste
sechsten
sechster
sechstes
sehr
sei
seid
seien
sein
seine
seinem
seinen
seiner
seines
seit
seitdem
selbst
sich
sie
sieben
siebente
siebenten
siebenter
siebentes
sind
so
solang
solche
solchem
solchen
solcher
solches
soll
sollen
sollst
sollt
sollte
sollten
sondern
sonst
soweit
sowie
später
startseite
statt
steht
suche
t
tag
tage
tagen
tat
teil
tel
tritt
trotzdem
tun
u
uhr
um
und
und?
uns
unse
unsem
unsen
unser
unsere
unserer
unses
unter
v
vergangenen
viel
viele
vielem
vielen
vielleicht
vier
vierte
vierten
vierter
viertes
vom
von
vor
w
wahr?
wann
war
waren
warst
wart
warum
was
weg
wegen
weil
weit
weiter
weitere
weiteren
weiteres
welche
welchem
welchen
welcher
welches
wem
wen
wenig
wenige
weniger
weniges
wenigstens
wenn
wer
werde
werden
werdet
weshalb
wessen
wie
wieder
wieso
will
willst
wir
wird
wirklich
wirst
wissen
wo
woher
wohin
wohl
wollen
wollt
wollte
wollten
worden
wurde
wurden
während
währenddem
währenddessen
wäre
würde
würden
x
y
z
z.b
zehn
zehnte
zehnten
zehnter
zehntes
zeit
zu
zuerst
zugleich
zum
zunächst
zur
zurück
zusammen
zwanzig
zwar
zwei
zweite
zweiten
zweiter
zweites
zwischen
zwölf
über
überhaupt
übrigens

synsets.xml (95963): file diff suppressed because it is too large

test.py (1133): file diff suppressed because it is too large

testo.py (1183): file diff suppressed because it is too large

testra.py (610)

@ -1,610 +0,0 @@
# -*- coding: utf-8 -*-
import re
import time
import json
#import spacy
#import textacy
from functools import reduce
import textacy
start = time.time()
import enchant
from datetime import datetime
import os
import xml.etree.ElementTree as ET
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
from miscellaneous import *
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
# load config
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
config.read_file(f)
PARSER=spacy.load("de")
corpi = textacy.Corpus(PARSER)
testcontetn = [
"fdsfdsfsd",
"juzdtjlkö",
"gfadojplk"
]
testmetda = [
{"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
{"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
{"categoryName":"zhb","Solution":"","Subject":"setuji"}
]
def makecontent(testcontetn):
for content in testcontetn:
yield content
def makemeta( testmetda):
for metdata in testmetda:
yield metdata
def corpus2Text(corpus):
for doc in corpus:
yield doc.text
corpi.add_texts(
makecontent(testcontetn),
makemeta(testmetda)
)
save_corpus(corpi,corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test",corpus_name="test")
bla = "uni mail account adresse woche falsch laufen schicken gerne januar betreff herr nachricht gruesse dezernat liebe datum freitag anfrage dienstag unicard karte abholen defekt bibliothek abholung dezember beantragung status gerne portal email nummer service id vorname prozess dez schauen eg rechner mitarbeiterin benutzerkonto oktober wissenschaftliche projekt fr download hilfskraft verantwortliche link dringend antrag schnelle arbeitsplatz november admin rahmen stand geschickt server outlook ordner bild konto postfach campus hi ueberpruefung sued beste daten freuen semester login benutzer gerne erstellen stelle frage system boss moeglichkeit student schoen spam alias geld vertrag juni ansprechpartner telefon raum einrichtung gebaeude telefonbuch abteilung element eintrag nutzer raum pc gerne lehrstuhl voraus fakultaet verfuegung herzliche drucker erreichen tlaptop kabel problem klaerung url adapter feedback koeln grundsaetzlich kaufmann problem fehler verbindung anhang meldung client netz netzwerk wenden funktionieren liebe mitarbeiter unterstuetzung aktuell herr benoetigt raumplanung gb weber vorab ueckmeldung software lizenz programm kurze urlaub gerne installation dankbar informieren team service problem loesung bestellung verlaengern verteiler alte aendern februar oeffnen update pdf browser notwendig fenster schulung beginn wege nord tkurs frage studierende personen teilnehmer standort gerne herunterladen voraus zusenden ews veranstaltung datei iso text umstellung absender message date html arbeit kaiser erfolgreich thema ablauf art at einfuehrung umfrage cloud zugang zugreifen montag probleme kollegin profil server handy web file ticket drucker einrichten senden nr mittwoch card mitteilen nrw kontakt mail fax universitaet it institut hardware hinweis fakultaet not strasse loeschen liste funktion auftrag zeitraum verwaltung angebot vorgehen entfernen moeglichkeit gefunden benutzername informatik gruppe eingabe nachname chemie dame b. angepasst name schoene abt post zukommen verlaengerung sommersemester fehlen namensaenderung auskunft tu dr prof pruefung herr namen fakultaet bereich lehrstuhl installieren buero ok anschluss maerz theologie notebook herr berechtigung master vorbeikommen passwort anmelden account hilfe helfen uniaccount anmeldung kennwort problem boss zugriff referat screenshot support laufwerk bildschirm super tastatur button auswaehlen"
bla = bla.split()
print(len(bla))
print(len(set(bla)))
print()
x = {'a':1, 'b': 2}
y = {'b':10, 'c': 11}
z = x.update(y)
print(x)
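# note: dict.update mutates x in place and returns None, so z is None here
# and the printed x is {'a': 1, 'b': 10, 'c': 11}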
"""
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, 
"video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56}
list = [(key,value) for key,value in dict.items()]
list.sort(key=lambda tup : tup[1])
"""
"""
from spacy.tokens.doc import Doc as SpacyDoc
filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin"
# load parser
parser = spacy.load("de")
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
stringstorepath = corpus_path + 'de_parser/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin')
parser.vocab.load_lexemes(vocabpath)
spacy_vocab = parser.vocab
def readCorpus(filepath):
with open_sesame(filepath, mode='rb') as f:
for bytes_string in SpacyDoc.read_bytes(f):
yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text
textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt")
"""
# load raw corpus and create new one
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
#printRandomDoc(raw_corpus)
"""
spacy_doc = PARSER("test")
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
print("Doc: {0}".format(spacy_doc2))
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root)
laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1}
with open(LLDA_filepath, 'w') as file:
file.write(json.dumps(laveldict))
"""
"""
def load_corpus(corpus_path, corpus_name, lang="de"):
from pathlib import Path
# load parser
parser = spacy.load(lang)
stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
with open(stringstorepath) as file:
parser.vocab.strings.load(file)
vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
parser.vocab.load_lexemes(vocabpath)
corpus = textacy.Corpus(parser)
contentpath = corpus_path + corpus_name + "_content.bin"
metapath = corpus_path + corpus_name + "_meta.json"
metadata_stream = textacy.fileio.read_json_lines(metapath)
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpus.add_doc(
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
return corpus
"""
"""
# THESAURUS
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
def build_thesaurus(path2lexicalentries):#, path2synsets):
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
#syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
lexroot = lextree.getroot()
#synroot = syntree.getroot()
word2synsets = {}
template = {"w1": ["s1", "s2"]}
for ro in lexroot:
for elem in ro:
if elem.tag == "LexicalEntry":
lex_dictlist = [subentry.attrib for subentry in elem]
synlist = []
string = "WORD"
for lex_dict in lex_dictlist:
if "synset" in lex_dict.keys():
synset = lex_dict["synset"]
synlist.append(synset)
if 'writtenForm' in lex_dict.keys():
string = (lex_dict["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# remove all dots
string = re.sub(r'[.]', "", string)
# remove everything in parentheses
string = re.sub(r"\((.*)\)", " ", string)
# normalize longer whitespace
string = textacy.preprocess.normalize_whitespace(string)
string = string.lower().strip()
word2synsets[string] = synlist
synset2Words = {}
template = {"s1": ["w1","w2"]}
for word,synset in word2synsets.items():
for syn in synset:
if syn not in synset2Words.keys():
synset2Words[syn] = [word]
else:
synset2Words[syn].append(word)
# sort by the number of words in the strings
for synset in word2synsets.values():
synset.sort(key=lambda x: len(x.split()))
thesaurus = {}
thesaurus_template = {"w1" : "mainsyn"}
for word,synset in word2synsets.items():
try:
thesaurus[word] = synset2Words[synset[0]][0] #note: the first synonym is assumed to be the main synonym
except:
pass
return thesaurus
for r in synroot:
for element in r:
if element.tag == "Synset":
synset = []
attrib = element.attrib
id = attrib["id"]
if id not in synset2Words.keys():
synset2Words[id] = "WORD"
"""
"""
from postal.parser import parse_address
address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
print(parse_address(address))
address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
print(parse_address(address))
"""
"""
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
corpus_name = "testcorpus"
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
import pathlib
strings_path = pathlib.Path(corpus_path + 'strings.json')
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
PARSER.vocab.dump(path_lexemes_bin_)
nlp.vocab.load_lexemes(path_lexemes_bin_)
def save_corpus(corpus_path,corpus_name):
# save stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path, "w") as file:
PARSER.vocab.strings.dump(file)
#save content
contentpath = corpus_path + corpus_name+ "_content.bin"
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi),contentpath)
#save meta
metapath = corpus_path + corpus_name +"_meta.json"
textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)
def load_corpus(corpus_path,corpus_name):
# load new lang
nlp = spacy.load("de")
#load stringstore
stringstore_path = corpus_path + corpus_name + '_strings.json'
with open(stringstore_path,"r") as file:
nlp.vocab.strings.load(file)
# define corpi
corpi = textacy.Corpus(nlp)
# load meta
metapath = corpus_path + corpus_name +"_meta.json"
metadata_stream = textacy.fileio.read_json_lines(metapath)
#load content
contentpath = corpus_path + corpus_name+ "_content.bin"
spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
corpi.add_doc(
textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))
return corpi
save_corpus(corpus_path,corpus_name)
print(load_corpus(corpus_path,corpus_name))
"""
"""
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
#return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
if not isinstance(word, str):
return str(word)
word = word.lower()
# iterate through the thesaurus
for syn_block in thesaurus: # syn_block is a list of synonyms
for syn in syn_block:
syn = syn.lower()
if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
if word == syn:
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
else: # if it is a phrase
if word in syn:
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
return str(word) # as a fallback, return the original word
def getHauptform(syn_block, word, default_return_first_Syn=False):
for syn in syn_block:
syn = syn.lower()
if "hauptform" in syn and len(syn.split(" ")) <= 2:
# do not return it if it is in parentheses #todo: does that happen sometimes?? strip parentheses
for w in syn.split(" "):
if not re.match(r'\([^)]+\)', w):
return w
if default_return_first_Syn:
# if no Hauptform is contained, return the first synonym that is not a phrase and not in parentheses
for w in syn_block:
if not re.match(r'\([^)]+\)', w):
return w
return word # as a fallback, return the original word
"""
"""
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
root = tree.getroot()
for r in root:
for element in r:
if element.tag == "Synset":
attrib = element.attrib
for i,subentry in enumerate(element):
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
string = (subentry.attrib["writtenForm"])
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
# seperate_words_on_regex:
string = " ".join(re.compile(regex_specialChars).split(string))
string_list=string.split()
if len(string_list) == 1:
nomen.append(string.lower().strip())
"""
"""
import re
from collections import Counter
def words(text): return re.findall(r'\w+', text.lower())
WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))
def P(word, N=sum(WORDS.values())):
"Probability of `word`."
return WORDS[word] / N
def correction(word):
"Most probable spelling correction for word."
return max(candidates(word), key=P)
def candidates(word):
"Generate possible spelling corrections for word."
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
def known(words):
"The subset of `words` that appear in the dictionary of WORDS."
return set(w for w in words if w in WORDS)
def edits1(word):
"All edits that are one edit away from `word`."
letters = 'abcdefghijklmnopqrstuvwxyz'
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
deletes = [L + R[1:] for L, R in splits if R]
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
inserts = [L + c + R for L, R in splits for c in letters]
return set(deletes + transposes + replaces + inserts)
def edits2(word):
"All edits that are two edits away from `word`."
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
"""
"""
### extract from derewo
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
for line in raw:
line_list=line.split()
if line_list[2] == "NN":
string = line_list[1].lower()
# replaceRockDots
string = re.sub(r'[ß]', "ss", string)
string = re.sub(r'[ö]', "oe", string)
string = re.sub(r'[ü]', "ue", string)
string = re.sub(r'[ä]', "ae", string)
nomen.append(string.lower().strip())
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
"""
"""
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
content_collumn_name = "Description"
content_collumn = 9 # standardvalue
de_tickets=[]
en_tickets=[]
misc_tickets=[]
error_count = 0
for i, lst in enumerate(stream):
if i == 0:
de_tickets.append(lst)
en_tickets.append(lst)
misc_tickets.append(lst)
else:
try:
content_collumn_ = lst[content_collumn]
if detect(content_collumn_) == "de":
de_tickets.append(lst)
elif detect(content_collumn_) == "en":
en_tickets.append(lst)
else:
misc_tickets.append(lst)
except:
misc_tickets.append(lst)
error_count += 1
print(error_count)
textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
"""
"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'
def stringcleaning(stringstream, funclist):
for string in stringstream:
for f in funclist:
string = f(string)
yield string
def seperate_words_on_regex(regex=regex_specialChars):
return lambda string: " ".join(re.compile(regex).split(string))
words = [
"uniaccount",
"nr54065467",
"nr54065467",
"455a33c5,"
"tvt?=",
"tanja.saborowski@tu-dortmund.de",
"-",
"m-sw1-vl4053.itmc.tu-dortmund.de",
"------problem--------"
]
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
for s in stringcleaning((w for w in words),[seperate_words_on_regex()]):
print(s.strip())
#print(stringcleaning(w,string_comp))
#print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?',w)))
#print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]',w)))
#result = specialFinder.sub(" ", w)
#print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]'," ",w))
#print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""
"""
def replaceRockDots():
return lambda string: re.sub(r'[ß]', "ss", (re.sub(r'[ö]', "oe", (re.sub(r'[ü]', "ue", (re.sub(r'[ä]', "ae", string.lower())))))))
de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))
#blob = Text(str(textacy.fileio.read_file("teststring.txt")))#,parser=PatternParser(pprint=True, lemmata=True))
#print(blob.entities)
de_stop_words = list(map(replaceRockDots(),de_stop_words))
#LEMMAS = list(map(replaceRockDots(),LEMMAS))
#VORNAMEN = list(map(replaceRockDots(),VORNAMEN))
de_stop_words = list(map(textacy.preprocess.normalize_whitespace,de_stop_words))
#LEMMAS = list(map(textacy.preprocess.normalize_whitespace,LEMMAS))
#VORNAMEN = list(map(textacy.preprocess.normalize_whitespace,VORNAMEN))
#textacy.fileio.write_file_lines(LEMMAS,"lemmas.txt")
#textacy.fileio.write_file_lines(VORNAMEN,"firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words,"german_stopwords.txt")
"""
end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))


@ -1,227 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<verzeichnis>
<ticket>
<Zusammenfassung>Telephone Contract</Zusammenfassung>
<Kategorie>Neuanschluss</Kategorie>
<Beschreibung>
Telefon-Neuanschluss
Antragsteller:
Melanie Hinrichs
melanie.hinrichs@tu-dortmund.de
 
 
 
Terminvorschlag unbestimmt
Einrichtung Dezernat 3
Abteilung Abteilung 2
PSP Element L-11-10000-100-302300
UniAccount myvowest(Westerdorf, Yvonne)
Gebäude Pavillon 8
Raum ID 031 (63292)
Telefondose keine vorhanden
Telefonnr. -
Eintrag Telefonbuch
E-Mail melanie.hinrichs@tu-dortmund.de
Voicemail Nicht erwünscht
Ansprechpartner Melanie Hinrichs
Tel. Ansprechpartner 5848
Verantwortlicher Nutzer -
Type Amt
Bemerkung:
Es wird ein Telefon benötigt,ein Telefon mit 6 Speicherpl.f.die Gruppenfunktion ist ausreichend. Die Möbel werden am 10.06.2015 aufgestellt.Weder Netzwerkdose noch Telefondose vorhanden. Dez.6 hat Vorbereitungen getroffen.
</Beschreibung>
<Loesung>Frau Hinrichs überdenkt die Situation und macht dann neue Anträge.
Dieses Ticket wird geschlossen</Loesung>
</ticket>
<ticket>
<Zusammenfassung>LSF/BOSS Datenexport</Zusammenfassung>
<Kategorie>LSF</Kategorie>
<Beschreibung>Sehr geehrter ITMC Service,
ich muss mir jedes Semester meine Veranstaltungen für das kommende
Semester zusammen suchen und stelle mir die Frage "Wie weit bin ich mit
meinem Studium? Welche Module kann ich wann belegen?". Gerade bei
mehreren Wahlmodulen gestaltet sich dies ja doch etwas schwieriger.
Daher möchte ich gerne, zunächst als experimentelle Privatprojekt, eine
leichtgewichtige Webseite erstellen, die mir dabei helfen soll. Meine
Vision ist, dies in weiteren Stufen meinen Kommilitonen der
Informatik-Fakultät und danach allen Studierenden zu Verfügung zu stellen.
Statt "das Rad neu zu erfinden" möchte ich einfach eine andere
Sichtweise auf die Daten in LSF und BOSS schaffen.
Zentraler Aspekt ist hier der Studienplan des Studiengangs, der
gleichzeitig eine Übersicht über noch zu erledigende Module und die
aktuelle Gesamt-Creditzahl liefern soll. Diese Ansicht soll auch dazu
dienen festzulegen, in welchem Semester man welches Modul machen möchte.
Darauf aufbauend möchte ich in einer nächsten Stufe gerne detaillierte
Veranstaltungsinformationen, ähnlich dem LSF, in einer
Facettensuche(ähnlich anzeigen. Dadurch sollen diese nach Studiengang,
Fakultät, Dozent, Semester, Turnus, etc. durchsuchbar werden.
Um den Studenten eine Orientierung zu liefern, wäre es zudem in einer
zukünftigen Version vorstellbar, den Studenten anhand der Studienpläne
und Modulabhängigkeiten (vorausgesetzte/erwünschte Kentnisse)
Veranstaltungen für das kommende Semester vorzuschlagen und
automatisiert Stundenpläne zu erstellen.
Daher möchte ich erfragen, ob
- es möglich ist einen Datenbank-Dump der Veranstaltungs-Basisdaten(z.B.
Titel, Dozent, Turnus, Uhrzeit, Beschreibung, etc.) zu erhalten
- das LSF und/oder das BOSS eine Programmierschnittstelle zur
Datenabfrage haben, welche nicht auf einem Login mit
Benutzername/Passwort basiert
- es möglich ist für einzelne Benutzer mit deren Erlaubnis eine Liste
aller Studienleistungen inkl. Veranstaltungs/BOSS-Nummer in einem
maschinenlesbaren Format (z.B. CSV oder XML, nicht PDF) abzufragen
Falls Sie noch offene Fragen haben, lassen Sie es mich wissen. Gerne
können wir diese auch in einem persönlichen Gespräch klären.
Vielen Dank!
 
Mit freundlichen Grüßen,
Tobias Brennecke
</Beschreibung>
<Loesung>alt</Loesung>
</ticket>
<ticket>
<Zusammenfassung>Zurücksetzung Passwort BOSS</Zusammenfassung>
<Kategorie>ITMC_Störungen</Kategorie>
<Beschreibung>Hallo.
Bitte setzen Sie mein Passwort zurück.
Ich würde gerne eine neues wählen.
Mit freundlichen Grüßen,
Ahmann.
IMAP0013 OK Completed (0.000 sec
IMAP0013 OK Completed (0.000 sec
IMAP0013 OK Completed (0.000 sec</Beschreibung>
<Loesung>können Sie sich im Service Portal einloggen?
Wenn ja, dann löschen Sie Ihre Cookies und den Cache.
Anschließend sollte auch die BOSS Anmeldung klappen.
Verwenden Sie Firefox oder Chrome.
Achten Sie darauf, dass der Account klein geschrieben ist, wenn sie sich mit einem Mobilgerät einloggen.
Sollte die Anmeldung im Service Portal nicht funktionieren, dann können Sie persönlich im Service Desk vorbeikommen und gegen Vorlage Ihres Personalausweises/Unicard Ihre Anmelde-Daten erhalten. Auch können wir Ihnen Ihre Zugangsdaten per Post zuschicken. Dazu müssen Sie allerdings ein paar Sicherheitsfragen beantworten:
1. Wie lautet Ihr Unimail-Namenskürzel (beginnend mit 'm' oder 'sm')
2. Wie lautet Ihre Matrikel-Nummer?
3. Wie lautet Ihr Geburtsdatum?
4. Wie lautet Ihre hinterlegte Post-Adresse?
5. Wie lautet die Antwort auf Ihre Sicherheitsfrage Geburtsname der Mutter?
6. Wie lautet Ihre aktuelle Post-Adresse?
</Loesung>
</ticket>
<ticket>
<Zusammenfassung>Forschungsantrag - Geräteanfrage</Zusammenfassung>
<Kategorie>Video</Kategorie>
<Beschreibung>Sehr geehrtes ITMC-Team,
für einen Forschungsantrag benötige ich einige technische Informationen
und bitte Sie herzlich um Unterstützung:
Zur Unterrichtsaufzeichnung möchte ich gern mit GoPro-Kameras arbeiten.
Ich möchte 4 Kameras beantragen. Könnten Sie mich beraten, welche
zusätzlichen Geräte man benötigt, um die Informationen dann für Lehre und
Forschung zu verarbeiten? Ich bin nicht sicher: gibt es Geräte, die eine
parallele Betrachtung ermöglichen? Benötigt man zusätzliche
Speicherkapazitäten? Sehr dankbar wäre ich, wenn Sie die Infos gleich mit
Kostenkalkulationen für den Antrag verbinden könnten.
Eine weitere Frage gleich nebenbei: Wird es an der TU auch die Möglichkeit
geben, in den Hörsälen direkt zentral Podcasts der Vorlesungen
aufzuzeichnen? Die Kollegen an der RUB verfügen über diese Möglichkeit
jenseits individueller Camtasia-Aufzeichnungen. Dort wird das zentral und
standardmäßig gemacht.
Ich arbeite momentan vom heimischen Schreibtisch aus. Sollten sie
Rückfragen telefonisch machen wollen, erreichten Sie mich unter
02302-9147798.
Ganz herzlichen Dank für Ihre Unterstützung!
Mit herzlichen Grüßen
Gudrun Marci-Boehncke
Prof. Dr. Gudrun Marci-Boehncke
TU Dortmund
Institut für deutsche Sprache und Literatur
Emil-Figge Str. 50, 3.241
44227 Dortmund
IMAP0013 OK Completed (0.000 sec
IMAP0013 OK Completed (0.000 sec
IMAP0013 OK Completed (0.000 sec
</Beschreibung>
<Loesung>Problem wurde telefonisch besprochen und eine Beratung ist dort erfolgt. Weitere Kommunikation erfolgt via eMail.</Loesung>
</ticket>
</verzeichnis>


@ -3,6 +3,7 @@
from datetime import datetime
import time
import numpy as np
import csv
import sys
@ -20,8 +21,6 @@ import os
csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
# load config
config_ini = FILEPATH + "config.ini"
@ -35,12 +34,12 @@ def label2ID(label, labeldict):
return labeldict.get(label, len(labeldict))
def generate_labled_lines(textacyCorpus, labeldict):
def generate_lablelID_lines(textacyCorpus, labeldict):
for doc in textacyCorpus:
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text
"""
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
@ -58,37 +57,47 @@ def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf
print(t)
logprint("doc_term_matrix: {0}".format(doc_term_matrix))
logprint("id2term: {0}".format(id2term))
"""
def textacyTopicModeling(corpus,
n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
ngrams = 1, min_df=1, max_df=1.0,
topicModel='lda'):
n_terms = int(n_topics * top_topic_words)
sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical'
rank_terms_by = 'corpus' # 'corpus', 'topic'
def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False):
logprint(
"############################################ Topic Modeling {0} #############################################".format(
"############### Topic Modeling {0} ###########################".format(
topicModel))
print("\n\n")
logprint(str("ngrams: {0}".format(ngrams)))
logprint(str("min_df: {0}".format(min_df)))
logprint(str("max_df: {0}".format(max_df)))
logprint(str("n_topics: {0}".format(n_topics)))
logprint(str("named_entities: {0}".format(named_entities)))
logprint("\n")
start = time.time()
top_topic_words = 7
top_document_labels_per_topic = 5
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
####################'####################
# printlog("vectorize corpi...")
#################### vectorize corpi ####################
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
doc_term_matrix = vectorizer.fit_transform(terms_list)
id2term = vectorizer.__getattribute__("id_to_term")
@ -97,44 +106,40 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='l
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
# Initialize and train a topic model
# printlog("Initialize and train a topic model..")
##################### Initialize and train a topic model ##############################################
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
model.fit(doc_term_matrix)
# Transform the corpi and interpret our model:
# printlog("Transform the corpi and interpret our model..")
doc_topic_matrix = model.transform(doc_term_matrix)
print()
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
print()
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
logprint(topic_idx)
for j in top_docs:
logprint(corpus[j].metadata['categoryName'])
print()
#####################################################################################################################
print()
print()
# termite plot
n_terms = int(n_topics*top_topic_words)
sort_terms_by = 'seriation' #'seriation', 'weight', 'index', 'alphabetical'
rank_terms_by = 'corpus' # 'corpus', 'topic'
####################### termite plot ###################################################################
grams_label = "uni" if ngrams == 1 else "bi"
model.termite_plot(doc_term_matrix, id2term,
n_terms=n_terms,
sort_terms_by=sort_terms_by,
rank_terms_by=rank_terms_by+'_weight',
save="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/results/{}_{}_{}_{}_{}.png".format(topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
save= FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
@ -142,48 +147,51 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='l
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=False):
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
start = time.time()
# build dictionary of ticketcategories
labelist = []
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
# build dictionary of ticketcategories
labelist = []
for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
labeldict = {k: v for v, k in enumerate(labelist)}
reverse_labeldict = {v: k for k, v in labeldict.items()}
if add_default_topic:
n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic
else:
n_topics = len(labeldict) # + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic
jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
#dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)
# printlog(str("LABELDICT: {0}".format(labeldict)))
#logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
with open(dict_path, 'w') as file:
#and save
labeldict_path = FILEPATH + "results/labeldict.txt"
with open(labeldict_path, 'w') as file:
file.write(json.dumps(labeldict))
# for line in generate_labled_lines(de_corpus,labeldict):
# print(line)
# create file
textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath)
n_topics = len(labeldict) #+1 #default-topic
# create file with label_IDs (input for llda)
textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)
# wait for file to exist
while not os.path.exists(LLDA_filepath):
time.sleep(1)
#top_topic_words=1
logprint("")
logprint("start LLDA:")
# run JGibsslda file
# run JGibbsLLDA file
FNULL = open(os.devnull, 'w') # supress output
cmd_jgibbs_java = ["java", "-cp",
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
@ -193,44 +201,20 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
# NOTE: the files are hidden; they can be found in models/
# twords
"""
subprocess.call(["gzip",
"-dc",
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
"""
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
"""
proc = subprocess.Popen(cmd_gzip, stdout=subprocess.PIPE)
process = subprocess.Popen(cmd_gzip, shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
# wait for the process to terminate
out, err = process.communicate()
errcode = process.returncode
result = subprocess.check_output(cmd_gzip)
#result = proc.stdout.read()
result = proc.communicate()
out=[]
for line in result:
out.append(line)
"""
output = subprocess.check_output(cmd_gzip).decode("utf-8")
reverse_labeldict = {v: k for k, v in labeldict.items()}
result = []
regex = re.compile(r'Topic [0-9]*')
for line in output.splitlines():
findall = regex.findall(line)
topic_regex = re.compile(r'Topic [0-9]*')
#####################################
# todo: save results to a file, based on the results
result = []
for line in output.splitlines():
findall = topic_regex.findall(line)
if len(findall) != 0:
try:
index = int(findall[0].split()[1])
@ -242,67 +226,136 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
else:
result.append(line)
textacy.fileio.write_file_lines(result, path2save_results)
#####################################################################################################################
textacy.fileio.write_file_lines(result, path2save_results+".txt")
#####################################
#todo llda termite plot
"""
topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
results = []
res_dict = {}
count =0
for line in output.splitlines():
# get topic and term labels
# <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
findall = topic_regex.findall(line)
# <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
if len(findall) != 0:
if len(res_dict) != 0:
results.append(res_dict) # append the previous dict to the list (it is complete at this point)
index = int(findall[0].split()[1])
res_dict = {index : str(reverse_labeldict[index]) }
else:
splitted = line.split()
res_dict[splitted[0]] = float(splitted[1])
### print terms that are topics
for s in list(res_dict.values()):
if isinstance(s,str) and splitted[0] in s:
vals = list(res_dict.values())
keys = list(res_dict.keys())
for v in vals:
if not isinstance(v,float):
print("{}".format(v))
print("{}".format(splitted[0]))
count +=1
print()
###
if len(res_dict) != 0:
results.append(res_dict) # append the last dict to the list
print(count)
print(float(count)/float(len(labelist)))
# {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}
# collect every term in the results into a list
terms=[]
for res in results:
for key,value in res.items():
if not isinstance(key, int) and not key in terms:
terms.append(key)
term2id = {t:i for i,t in enumerate(terms)} #and to dict
################# termite plot #####################################################################
#term_topic_weights.shape = (len(term_ids),len(topic_ids)
#topic_labels = tuple(labelist)
topic_labels = list(range(len(labelist)))
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
for i,res in enumerate(results):
for key,value in res.items():
if not isinstance(key, int):
term_topic_weights[term2id[key]][i] = value
term_labels[term2id[key]] = key
else:
topic_labels[i] = reverse_labeldict[key]
# get topic-term weights to size dots
#[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
# [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
# [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
# ...,
# [
term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
for topic_ind in topic_inds]).T
viz.draw_termite_plot(
term_topic_weights, topic_labels, term_labels, save=path2save_results)
"""
logprint("")
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
end = time.time()
logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60))
def main(use_raw=False, algorithm="llda"):
logprint("Topic Modeling: {0}".format(datetime.now()))
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
if use_raw:
# error: Unknown document label ( X ) for document 352.
preCorpus_name = "de" + "_raw_ticket"
resultspath = FILEPATH + "results/raw"
else:
preCorpus_name = "de" + "_pre_ticket"
resultspath = FILEPATH + "results/pre"
def main(use_cleaned=False, algorithm="llda"):
# load raw corpus and create new one
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(de_corpus.lang))
# idea: http://bigartm.org/
# idea: http://wiki.languagetool.org/tips-and-tricks
# idea: https://en.wikipedia.org/wiki/Noisy_text_analytics
# idea: https://gate.ac.uk/family/
logprint("Topic Modeling: {0}".format(datetime.now()))
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
if use_cleaned:
preCorpus_name = "de" + "_clean_ticket"
resultspath = FILEPATH + "results/clean"
else:
preCorpus_name = "de" + "_pre_ticket"
resultspath = FILEPATH + "results/pre"
# load cleand corpus
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
logprint("Corpus loaded: {0}".format(de_corpus.lang))
# todo: merge the llda topics
# idea: train lda so that the term <-> topic assignment does not get too weak, while keeping as many topics as possible
# question: lda - how many tickets per topic?
# question: how many tickets per topic?
"""
ngrams = 1
@ -324,47 +377,26 @@ def main(use_raw=False, algorithm="llda"):
if algorithm == "llda":
top_topic_words = 5
add_default_topic = False
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 5
add_default_topic = True
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
"""
top_topic_words = 10
add_default_topic = False
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
top_topic_words = 10
add_default_topic = True
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
add_default_topic=add_default_topic)
# no_below = 20
# no_above = 0.5
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
# n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 because of a default topic
top_topic_words = 15
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
top_topic_words = 20
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
"""
else:
# build dictionary of ticketcategories
labelist = []
for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
labelist.append(texdoc.metadata["categoryName"])
labeldict = {k: v for v, k in enumerate(labelist)}
textacyTopicModeling(ngrams = 1,
min_df = 1,
@ -372,7 +404,7 @@ def main(use_raw=False, algorithm="llda"):
topicModel = algorithm,
n_topics =15,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=1,
min_df=1,
max_df=0.9,
@ -394,7 +426,7 @@ def main(use_raw=False, algorithm="llda"):
topicModel=algorithm,
n_topics=30,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams=(1, 2),
@ -403,7 +435,7 @@ def main(use_raw=False, algorithm="llda"):
topicModel=algorithm,
n_topics=15,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.9,
@ -425,59 +457,7 @@ def main(use_raw=False, algorithm="llda"):
topicModel = algorithm,
n_topics =30,
corpus=de_corpus)
"""
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 0.8,
topicModel = algorithm,
n_topics =n_topics,
corpus=de_corpus)
"""
"""
textacyTopicModeling(ngrams = 1,
min_df = 0.1,
max_df = 0.6,
topicModel = algorithm,
n_topics =n_topics,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 1,
max_df = 1.0,
topicModel = algorithm,
n_topics =n_topics,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 0.1,
max_df = 0.6,
topicModel = algorithm,
n_topics =n_topics,
corpus=de_corpus)
textacyTopicModeling(ngrams = (1,2),
min_df = 0.2,
max_df = 0.8,
topicModel = algorithm,
n_topics = 20,
corpus=de_corpus)
"""

vornamen.txt (20561): file diff suppressed because it is too large