Runnable version
This commit is contained in:
parent ecc8c0c54a
commit 0a6a68b8aa

383 backup.py

@@ -1,383 +0,0 @@
# -*- coding: utf-8 -*-


############# misc

def printlog(string, level="INFO"):
    """log and prints"""
    print(string)
    if level == "INFO":
        logging.info(string)
    elif level == "DEBUG":
        logging.debug(string)
    elif level == "WARNING":
        logging.warning(string)

printlog("Load functions")


def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

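A minimal usage sketch of compose (not part of the original file): functions are applied right to left, so the rightmost one runs first. The helper names below are illustrative only.

# Hypothetical example:
import functools

def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)

strip_then_lower = compose(str.lower, str.strip)  # strip runs first, then lower
assert strip_then_lower("  Hello World ") == "hello world"
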
def get_calling_function():
    """finds the calling function in many decent cases.
    https://stackoverflow.com/questions/39078467/python-how-to-get-the-calling-function-not-just-its-name
    """
    fr = sys._getframe(1)   # inspect.stack()[1][0]
    co = fr.f_code
    for get in (
            lambda: fr.f_globals[co.co_name],
            lambda: getattr(fr.f_locals['self'], co.co_name),
            lambda: getattr(fr.f_locals['cls'], co.co_name),
            lambda: fr.f_back.f_locals[co.co_name],  # nested
            lambda: fr.f_back.f_locals['func'],      # decorators
            lambda: fr.f_back.f_locals['meth'],
            lambda: fr.f_back.f_locals['f'],
    ):
        try:
            func = get()
        except (KeyError, AttributeError):
            pass
        else:
            if func.__code__ == co:
                return func
    raise AttributeError("func not found")


def printRandomDoc(textacyCorpus):
    import random
    print()
    printlog("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    printlog("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


############# load xml

def generateMainTextfromTicketXML(path2xml, main_textfield='Description'):
    """
    generates strings from XML
    :param path2xml:
    :param main_textfield:
    :yields strings
    """
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        for field in ticket:
            if field.tag == main_textfield:
                yield field.text


def generateMetadatafromTicketXML(path2xml, leave_out=['Description']):
    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag not in leave_out:
                metadata[field.tag] = field.text

        yield metadata


############# load csv

def csv_to_contentStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string
    :param content_collumn_name: string
    :return: string-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')
    content_collumn = 0  # default value

    for i, lst in enumerate(stream):
        if i == 0:
            # look for desired column
            for j, col in enumerate(lst):
                if col == content_collumn_name:
                    content_collumn = j
        else:
            yield lst[content_collumn]


def csv_to_metaStream(path2csv: str, metalist: [str]):
    """
    :param path2csv: string
    :param metalist: list of strings
    :return: dict-generator
    """
    stream = textacy.fileio.read_csv(path2csv, delimiter=";")  # ,encoding='utf8')

    content_collumn = 0  # default value
    metaindices = []
    metadata_temp = {}
    for i, lst in enumerate(stream):
        if i == 0:
            for j, col in enumerate(lst):  # could surely be done more efficiently, but it only runs once
                for key in metalist:
                    if key == col:
                        metaindices.append(j)
            metadata_temp = dict(zip(metalist, metaindices))  # e.g. {'Subject': 1, 'categoryName': 3, 'Solution': 10}

        else:
            metadata = metadata_temp.copy()
            for key, value in metadata.items():
                metadata[key] = lst[value]
            yield metadata

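A hedged usage sketch of the two CSV generators above; the file path is the ticket export named in config.ini and the column names come from the metaliste there, but the pairing itself is illustrative only.

# Hypothetical example:
texts = csv_to_contentStream("M42-Export/Tickets_small.csv", "Description")
metas = csv_to_metaStream("M42-Export/Tickets_small.csv", ["TicketNumber", "categoryName"])

# both generators walk the same rows in order, so zipping keeps text and metadata aligned
for text, meta in zip(texts, metas):
    print(meta["TicketNumber"], text[:80])
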
############################################ Preprocessing ##############################################


############# on str-gen

def processTokens(tokens, funclist, parser):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        # idea: sort funclist so that all string functions run first, then parse once, then work on tokens, then possibly on the whole Doc

        if 'bool' in str(f.__annotations__):
            tokens = list(filter(f, tokens))

        elif 'str' in str(f.__annotations__):
            tokens = list(map(f, tokens))                         # plain strings
            doc = parser(" ".join(tokens))                        # re-parse
            tokens = [tok for tok in doc]                         # tokens only

        elif 'spacy.tokens.doc.Doc' in str(f.__annotations__):
            # todo: feels hacky
            doc = parser(" ".join(tok.lower_ for tok in tokens))  # parsed
            tokens = f(doc)
            doc = parser(" ".join(tokens))                        # re-parsed
            tokens = [tok for tok in doc]                         # tokens only
        else:
            warnings.warn("Unknown Annotation while preprocessing. Function: {0}".format(str(f)))

    return tokens


def processTextstream(textstream, funclist, parser=DE_PARSER):
    """
    :param textstream: string-gen
    :param funclist: [func]
    :param parser: spacy-parser
    :return: string-gen
    """
    # input: str-stream, output: str-stream
    pipe = parser.pipe(textstream)

    for doc in pipe:
        tokens = []
        for tok in doc:
            tokens.append(tok)

        tokens = processTokens(tokens, funclist, parser)
        yield " ".join([tok.lower_ for tok in tokens])


def processDictstream(dictstream, funcdict, parser=DE_PARSER):
    """
    :param dictstream: dict-gen
    :param funcdict:
            clean_in_meta = {
                "Solution": funclist,
                ...
            }
    :param parser: spacy-parser
    :return: dict-gen
    """
    for dic in dictstream:
        result = {}
        for key, value in dic.items():
            if key in funcdict:
                doc = parser(value)
                tokens = [tok for tok in doc]
                funclist = funcdict[key]

                tokens = processTokens(tokens, funclist, parser)

                result[key] = " ".join([tok.lower_ for tok in tokens])
            else:
                result[key] = value
        yield result

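A hedged sketch of how these stream processors are wired together. It assumes DE_PARSER is a loaded German spaCy model and uses the filter/replace factories defined further down in this file; the concrete funclist is illustrative only.

# Hypothetical example, mirroring the clean_in_meta pattern from the docstring above:
funclist = [
    removePOS(["SPACE", "PUNCT"]),  # bool-annotated -> applied with filter()
    replaceEmails(),                # str-annotated  -> applied with map(), then re-parsed
    lemmatize(),
]
clean_in_meta = {"Solution": [removePOS(["SPACE"])]}

path2csv = "M42-Export/de_tickets.csv"
cleaned_texts = processTextstream(csv_to_contentStream(path2csv, "Description"), funclist)
cleaned_metas = processDictstream(csv_to_metaStream(path2csv, ["Subject", "Solution"]), clean_in_meta)
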
############# return bool

def keepPOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ in pos_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def removePOS(pos_list) -> bool:
    ret = lambda tok: tok.pos_ not in pos_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def removeWords(words, keep=None) -> bool:
    if hasattr(keep, '__iter__'):
        for k in keep:
            try:
                words.remove(k)
            except ValueError:
                pass

    ret = lambda tok: tok.lower_ not in words

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def keepENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ in ent_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def removeENT(ent_list) -> bool:
    ret = lambda tok: tok.ent_type_ not in ent_list

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_Numbers() -> bool:
    ret = lambda tok: not bool(re.search(r'\d', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_specialCharacters() -> bool:
    ret = lambda tok: not bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def remove_words_containing_topLVL() -> bool:
    ret = lambda tok: not bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', tok.lower_))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret

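All of the factories above rely on the same trick: the returned lambda is tagged with the factory's own return annotation (fetched via get_calling_function), and processTokens dispatches on that annotation to decide whether the function is a token filter, a string mapper, or a Doc-level step. A stripped-down illustration of the mechanism, independent of spaCy:

# Hypothetical example, not from the original file:
def keep_short(max_len) -> bool:
    ret = lambda tok: len(tok) <= max_len
    ret.__annotations__ = keep_short.__annotations__  # same effect as get_calling_function() here
    return ret

f = keep_short(5)
print(f.__annotations__)                              # {'return': <class 'bool'>}
print(list(filter(f, ["kurz", "Reiseruecktritt"])))   # ['kurz']
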
def lemmatizeWord(word, filepath=LEMMAS):
    """http://www.lexiconista.com/datasets/lemmatization/"""
    for line in list(textacy.fileio.read_file_lines(filepath=filepath)):
        if word.lower() == line.split()[1].strip().lower():
            return line.split()[0].strip().lower()
    return word.lower()  # if nothing was found


def lemmatize() -> str:
    ret = lambda tok: lemmatizeWord(tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret

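lemmatizeWord re-reads the whole lemma file for every single word. A hedged sketch of the obvious precomputation (building the lookup table once, which is what the pickled lemma dict in init.py suggests); the "<lemma> <inflected form>" line format is the same one lemmatizeWord assumes above.

# Sketch only:
def build_lemma_dict(filepath=LEMMAS):
    lemma_dict = {}
    for line in textacy.fileio.read_file_lines(filepath=filepath):
        parts = line.split()
        if len(parts) >= 2:
            lemma_dict[parts[1].strip().lower()] = parts[0].strip().lower()
    return lemma_dict

LEMMA_DICT = build_lemma_dict()
lemmatizeWord_fast = lambda word: LEMMA_DICT.get(word.lower(), word.lower())
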
############# return strings

mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)
hardSFinder = re.compile(r'[ß]', re.IGNORECASE)


def replaceEmails(replace_with="EMAIL") -> str:
    ret = lambda tok: emailFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceURLs(replace_with="URL") -> str:
    ret = lambda tok: textacy.preprocess.replace_urls(tok.lower_, replace_with=replace_with)
    # ret = lambda tok: urlFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceSpecialChars(replace_with=" ") -> str:
    ret = lambda tok: specialFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceTwitterMentions(replace_with="TWITTER_MENTION") -> str:
    ret = lambda tok: mentionFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceNumbers(replace_with="NUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_numbers(tok.lower_, replace_with=replace_with)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replacePhonenumbers(replace_with="PHONENUMBER") -> str:
    ret = lambda tok: textacy.preprocess.replace_phone_numbers(tok.lower_, replace_with=replace_with)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def replaceHardS(replace_with="ss") -> str:
    ret = lambda tok: hardSFinder.sub(replace_with, tok.lower_)

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def fixUnicode() -> str:
    ret = lambda tok: textacy.preprocess.fix_bad_unicode(tok.lower_, normalization=u'NFC')

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def resolveAbbreviations():
    pass  # todo

# todo: drop words with len < 2 (after resolving abbreviations, especially "tu" and "fh") and with len > 35 or 50 ("Reiserücktrittskostenversicherung")


############# return docs

def keepUniqeTokens() -> spacy.tokens.Doc:
    ret = lambda doc: (set([tok.lower_ for tok in doc]))

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


def lower() -> spacy.tokens.Doc:
    ret = lambda doc: ([tok.lower_ for tok in doc])

    ret.__annotations__ = get_calling_function().__annotations__
    return ret


################################################################################################################
36 cleaning.py

@@ -11,6 +11,9 @@ from scipy import *
import os

from preprocessing import removePOS
from preprocessing import filterTokens

csv.field_size_limit(sys.maxsize)
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

@@ -24,11 +27,6 @@ with open(config_ini) as f:
    config.read_file(f)

global REGEX_SPECIALCHAR
global WORDS

REGEX_SPECIALCHAR = r'[`\=~%^&*()_+\[\]{};\'"|</>]' #+r',.-\\:' #+r',.?!'
WORDS = {}

@@ -113,15 +111,12 @@ def clean(stringstream,autocorrect=False):
    string = re.sub(r'[ü]', "ue", string)
    string = re.sub(r'[ä]', "ae", string)

-   # question: autocorrect?
-   # idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
+   # question: autocorrect? idea: http://lexitron.nectec.or.th/public/COLING-2010_Beijing_China/POSTERS/pdf/POSTERS022.pdf
    if autocorrect:
        string = " ".join([autocorrectWord(word) for word in string.split()])

    yield string


def processDictstream(dictstream, funcdict, parser):
    """

@@ -154,30 +149,21 @@ def processDictstream(dictstream, funcdict, parser):
            result[key] = value
        yield result

def filterTokens(tokens, funclist):
    # in: tokenlist, funclist
    # out: tokenlist
    for f in funclist:
        tokens = list(filter(f, tokens))

    return tokens

def removePOS(pos_list):
    return lambda tok: tok.pos_ not in pos_list

##################################################################################################

ressources_path = FILEPATH + "ressources/"

-path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
+path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")

corpus_de_path = FILEPATH + config.get("de_corpus", "path")
corpus_en_path = FILEPATH + config.get("en_corpus", "path")

autocorrect = config.getboolean("preprocessing", "autocorrect")


-def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
+def cleanCorpus(corpus_path, clean_in_meta, lang="de", printrandom=10, autocorrect=False):

    logprint("Clean {0}_corpus at {1}".format(lang, datetime.now()))

@@ -192,7 +178,7 @@ def cleanCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrando
    ## process and add files to the textacy corpus
    clean_corpus.add_texts(
-       clean(corpus2Text(raw_corpus)),
+       clean(corpus2Text(raw_corpus), autocorrect=autocorrect),
        processDictstream(corpus2Meta(raw_corpus), clean_in_meta, parser=parser)
    )

@@ -220,8 +206,6 @@ def main():
    WORDS = load_obj(path2wordsdict)

    clean_in_content = []  # question: is this necessary?

    clean_in_meta = {
        "Solution": [removePOS(["SPACE"])],

@@ -229,7 +213,7 @@ def main():
        "categoryName": [removePOS(["SPACE", "PUNCT"])]
    }

-   corpus = cleanCorpus(corpus_de_path, clean_in_content, clean_in_meta, "de", printrandom=5)
+   corpus = cleanCorpus(corpus_de_path, clean_in_meta, "de", printrandom=5, autocorrect=autocorrect)

    end = time.time()
    logprint("Time Elapsed Cleaning:{0} min".format((end - start) / 60))
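A hedged sketch of the clean() generator this hunk touches, reduced to what is visible in the diff (umlaut folding plus the new optional autocorrect pass); autocorrectWord and the WORDS dict are assumed to come from the surrounding module.

# Sketch only, not the full clean() from cleaning.py:
def clean(stringstream, autocorrect=False):
    for string in stringstream:
        string = re.sub(r'[ü]', "ue", string)
        string = re.sub(r'[ä]', "ae", string)
        if autocorrect:
            string = " ".join([autocorrectWord(word) for word in string.split()])
        yield string
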
@@ -1,24 +0,0 @@
Index: 0
Text: lieber support, ich habe gerade versucht mich mit meiner unicard im firefox browser fuer das service portal zu authentifizieren. das hat vor einigen wochen noch tadelos geklappt und mittlerweile bekomme ich folgende fehlermeldung ich hoffe sie koennen mir weiterhelfen. vielen dank und viele gruesse sascha feldhorst dipl. inform. sascha feldhorst wiss. ang. technische universitaet dortmund maschinenbau lehrstuhl fuer foerder und lagerwesen logistikcampus joseph von fraunhofer str. 2 4 d 44227 dortmund tel. 49 231 755 40 73 fax 49 231 755 47 68 mailto sascha.feldhorst@tu dortmund.de sascha.feldhorst@tu dortmund.de http www.flw.mb.tu dortmund.de www.flw.mb.tu dortmund.de wichtiger hinweis die information in dieser e mail ist vertraulich. sie ist ausschliesslich fuer den adressaten bestimmt. sollten sie nicht der fuer diese e mail bestimmte adressat sein, unterrichten sie bitte den absender und vernichten sie diese mail. vielen dank. unbeschadet der korrespondenz per e mail, sind unsere erklaerungen ausschliesslich final rechtsverbindlich, wenn sie in herkoemmlicher schriftform mit eigenhaendiger unterschrift oder durch uebermittlung eines solchen schriftstuecks per telefax erfolgen. important note the information included in this e mail is confidential. it is solely intended for the recipient. if you are not the intended recipient of this e mail please contact the sender and delete this message. thank you. without prejudice of e mail correspondence, our statements are only legally binding when they are made in the conventional written form with personal signature or when such documents are sent by fax.
categoryName: betrieb

Index: 0
Text: support browser service portal mittlerweile
categoryName: betrieb


Index: 1
Text: telefon umzug antragsteller astrid gramm astrid.gramm@tu dortmund.de terminvorschlag 14.08.2015 einrichtung dezernat 2 abteilung 2.5 psp element uniaccount mnichofm hofmann, nicole gebaeude dezernat 5 raum id 201 651430 telefondose neztwerkdose dt04.5 04.6 telefonnr. 4821 eintrag telefonbuch e mail astrid.gramm@tu dortmund.de voicemail ansprechpartner astrid gramm tel. ansprechpartner 5444 verantwortlicher nutzer type bemerkung frau hofmann wird am 14.08.2015 in die wd 2 umziehen. es ist der raum 201a im og nicht 201 eine bezeichnung der telefondose ist nicht vorhanden.
categoryName: elektronisches telefonbuch

Index: 1
Text: telefon umzug antragsteller gramm einrichtung dezernat abteilung element gebaeude dezernat raum id eintrag telefonbuch mail ansprechpartner gramm ansprechpartner verantwortlicher nutzer type bemerkung raum bezeichnung
categoryName: elektronisches telefonbuch
11 config.ini

@@ -37,12 +37,12 @@ pickle_file=en_stopwords_list.pkl

[logging]
level=INFO
-filename=topicModelTickets.log
+filename=log/topicModelTickets.log


[de_corpus]
-input=M42-Export/Tickets_small.csv
-#input=M42-Export/de_tickets.csv
+#input=M42-Export/Tickets_small.csv
+input=M42-Export/de_tickets.csv

path=corpi/

@@ -64,7 +64,10 @@ metaliste=TicketNumber,Subject,CreatedDate,categoryName,Impact,Urgency,BenutzerI

#ents2keep=WORK_OF_ART,ORG,PRODUCT,LOC

-custom_words=eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok
+autocorrect = false
+#true

+custom_words=aenderung,hahn,verantwortlicher,rolle,status,fehlgeschlagen,aenderung,test,erwuenscht,antragsteller,bemerkung,tu,uni,prof,bezeichnung,gramm,type,eintrag,element,nutzer,einrichtung,abteilung,gebaeude,raum,ansprechpartner,geehrt,dr,not,frage,betreff,gerne,dame,herr,frau,hilfe,moeglichkeit,beste,freuen,voraus,problem,lauten,bedanken,voraus,hallo,gerne,freundlich,fragen,fehler,bitten,ehre,lieb,liebe,gruesse,helfen,versuchen,unbestimmt,woche,tadelos,klappen,mittlerweile,bekommen,erreichbar,gruss,auffahren,vorgang,hinweis,name,gruss,id,erfolg,folge,team,absender,versenden,vorname,strasse,prozess,portal,moeglichkeit,fremd,wende,rueckfrage,stehen,verfuegung,funktionieren,pruefen,hoffen,ok,januar,februar,maerz,april,mai,juni,juli,august,september,oktober,november,dezember
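A hedged sketch of how these options are consumed on the Python side, matching the config.get / getboolean calls that appear in cleaning.py and miscellaneous.py; the section name for custom_words is an assumption.

# Illustrative only:
import configparser

config = configparser.ConfigParser()
with open("config.ini") as f:
    config.read_file(f)

autocorrect = config.getboolean("preprocessing", "autocorrect")   # False for "false"
custom_words = [w.strip() for w in config.get("preprocessing", "custom_words").split(",")]  # section assumed
corpus_input = config.get("de_corpus", "input")                   # "M42-Export/de_tickets.csv"
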
@@ -23,8 +23,6 @@ with open(config_ini) as f:



def ticketcsv_to_textStream(path2csv: str, content_collumn_name: str):
    """
    :param path2csv: string

@@ -75,27 +73,9 @@ def ticket_csv_to_DictStream(path2csv: str, metalist: [str]):
##################################################################################################


# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/corporization.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_corporization.log &"


"""
content_collumn_name = "Description"
metaliste = [
    "TicketNumber",
    "Subject",
    "CreatedDate",
    "categoryName",
    "Impact",
    "Urgency",
    "BenutzerID",
    "VerantwortlicherID",
    "EigentuemerID",
    "Solution"
]
"""

content_collumn_name = config.get("tickets","content_collumn_name")
-metaliste = list(map(normalize_whitespace,config.get("tickets","metaliste").split(",")))
+metaliste = get_list_from_config("tickets","metaliste")


path2de_csv = FILEPATH + config.get("de_corpus","input")

@@ -110,7 +90,6 @@ corpus_en_path = FILEPATH + config.get("en_corpus", "path")
def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, lang, printrandom=0):

    # print paths
    path_csv_split = path2_csv.split("/")
    filename = path_csv_split[len(path_csv_split) - 1]

@@ -121,8 +100,6 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
    raw_corpus = textacy.Corpus(lang)

    ## add files to the textacy corpus
    #printlog("Add texts to {0}_textacy-corpi".format(lang))

    raw_corpus.add_texts(
        ticketcsv_to_textStream(path2_csv, content_collumn_name),
        ticket_csv_to_DictStream(path2_csv, metaliste)

@@ -132,6 +109,7 @@ def ticketcsv2Corpus(path2_csv, corpus_path, content_collumn_name, metaliste, la
    # kick empty docs out of the corpus
    raw_corpus.remove(lambda doc: len(doc) == 0)

    logprint("corpus-lenght: {}".format(len(raw_corpus)))
    # print random docs
    for i in range(printrandom):
        printRandomDoc(raw_corpus)
1855 german_stopwords.txt
File diff suppressed because it is too large
33 init.py

@@ -237,36 +237,37 @@ def build_words_for_spellchecking(path2words):
##################################################################################################

# THESAURUS
-path2wordnet = FILEPATH + config.get("thesaurus","input")
-path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
+ressources_path = FILEPATH + "ressources/"
+path2wordnet = ressources_path + config.get("thesaurus","input")
+path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")


# SPELLCHECKING
-path2words_file = FILEPATH + config.get("spellchecking","input")
-path2wordlist = FILEPATH + config.get("spellchecking","pickle_file")
+path2words_file = ressources_path + config.get("spellchecking","input")
+path2wordlist = ressources_path + config.get("spellchecking","pickle_file")


# LEMMA
-path2lemma_file = FILEPATH + config.get("lemmatization","input")
-path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
+path2lemma_file = ressources_path + config.get("lemmatization","input")
+path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")

# NOUNS
-nouns1 = FILEPATH + config.get("nouns","input1")
-nouns2 = FILEPATH + config.get("nouns","input2")
-path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
+nouns1 = ressources_path + config.get("nouns","input1")
+nouns2 = ressources_path + config.get("nouns","input2")
+path2nouns_list = ressources_path + config.get("nouns","pickle_file")


# FIRST NAMES
-firstnames_txt = FILEPATH + config.get("firstnames","input")
-path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
+firstnames_txt = ressources_path + config.get("firstnames","input")
+path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")

# STOPWORDS
-stop1 = FILEPATH + config.get("de_stopwords","input1")
-stop2 = FILEPATH + config.get("de_stopwords","input2")
-stop3 = FILEPATH + config.get("de_stopwords","input3")
-path2stopwordlist_de = FILEPATH + config.get("de_stopwords","pickle_file")
+stop1 = ressources_path + config.get("de_stopwords","input1")
+stop2 = ressources_path + config.get("de_stopwords","input2")
+stop3 = ressources_path + config.get("de_stopwords","input3")
+path2stopwordlist_de = ressources_path + config.get("de_stopwords","pickle_file")

-path2stopwordlist_en = FILEPATH + config.get("en_stopwords","pickle_file")
+path2stopwordlist_en = ressources_path + config.get("en_stopwords","pickle_file")
Binary file not shown.
Binary file not shown.
Binary file not shown.
358474 lemmatization-de.txt
File diff suppressed because it is too large

654747 lexicalentries.xml
File diff suppressed because it is too large
21 main.py

@@ -11,12 +11,12 @@ import cleaning

from miscellaneous import *

-# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_main.log &"
+# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/main.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/log/printout_main.log &"
start = time.time()


-#init.main()
+init.main()
logprint("")

corporization.main()

@@ -30,32 +30,23 @@ logprint("")

"""
-topicModeling.main(use_raw=False,algorithm="lsa")
+#topicModeling.main(use_cleaned=False,algorithm="lsa")
logprint("")

-topicModeling.main(use_raw=False,algorithm="lda")
+#topicModeling.main(use_cleaned=False,algorithm="nmf")
logprint("")

-topicModeling.main(use_raw=False,algorithm="nmf")
+#topicModeling.main(use_cleaned=False,algorithm="lda")
logprint("")

-topicModeling.main(use_raw=False,algorithm="llda")
+topicModeling.main(use_cleaned=False,algorithm="llda")
logprint("")
"""


logprint("")

end = time.time()
logprint("Total Time Elapsed: {0} min".format((end - start) / 60))
112 miscellaneous.py

@@ -153,6 +153,25 @@ def printRandomDoc(textacyCorpus):

    print()

def get_list_from_config(section, option):
    return list(map(textacy.preprocess.normalize_whitespace, config.get(section, option).split(",")))

def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def savelabledCorpiLines(corpus, filepath):
    textacy.fileio.write_file_lines(gen_labledLines(corpus), filepath=filepath)

def gen_labledLines(corpus):
    for doc in corpus:
        # generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
        yield "[" + doc.metadata["categoryName"] + "] " + doc.text


def save_corpus(corpus, corpus_path, corpus_name):

@@ -219,95 +238,6 @@ def load_corpus(corpus_path, corpus_name, lang="de"):
        for key, value in plain.items():
            if key != "content" and key != "index":
                meta[key] = value
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang


"""
def corpus2Text(corpus):
    for doc in corpus:
        yield doc.text

def corpus2Meta(corpus):
    for doc in corpus:
        yield doc.metadata

def saveplaincorpustext(corpus, path):
    textacy.fileio.write_file_lines(corpus2Text(corpus), filepath=path)

def save_corpusV2(corpus, corpus_path, corpus_name):

    # save parser
    parser = corpus.spacy_lang
    parserpath = corpus_path + str(parser.lang) + '_parser'
    parser.save_to_directory(parserpath)

    contentpath = corpus_path + corpus_name + "_docs/"
    if not os.path.exists(contentpath):
        os.makedirs(contentpath)

    for doc in corpus:
        with open(contentpath + str(doc.corpus_index) + "_doc.bin", 'w') as f:
            f.write(doc.spacy_doc.to_bytes())
        with open(contentpath + str(doc.corpus_index) + "_meta.json", 'w') as file:
            file.write(json.dumps(doc.metadata))

def load_corpusV2(corpus_path, corpus_name, lang="de"):

    # check for language
    if "de_" in corpus_name:
        lang = "de"
    elif "en_" in corpus_name:
        lang = "en"

    # load parser
    parser = spacy.load(lang)

    stringstorepath = corpus_path + str(lang) + '_parser' + '/vocab/strings.json'
    with open(stringstorepath) as file:
        parser.vocab.strings.load(file)

    vocabpath = Path(corpus_path + str(lang) + '_parser' + '/vocab/lexemes.bin')
    parser.vocab.load_lexemes(vocabpath)

    # load corpus
    corpus = textacy.Corpus(parser)

    contentpath = corpus_path + corpus_name + "_docs/"
    docs = yield_fromdir(contentpath, spacy_vocab=corpus.spacy_vocab, type="doc")
    metas = yield_fromdir(contentpath, type="meta")

    for doc, meta in zip(docs, metas):
        corpus.add_doc(
            textacy.Doc(doc, lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang

def yield_fromdir(path, spacy_vocab=None, type=".pkl"):
    os.chdir(path)
    filelist = [name for name in os.listdir('.') if os.path.isfile(name)]
    filelist = [filename for filename in filelist if type in filename]
    filelist.sort(key=lambda elem: elem.split("_")[0])

    if type == 'doc':
        for filename in filelist:
            with open(path + filename, 'r') as f:
                for bytes_string in SpacyDoc.read_bytes(f):
                    yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
    elif type == 'meta':
        for filename in filelist:
            with open(path + filename, 'r') as f:
                yield json.load(f)
    else:
        for filename in filelist:
            yield load_obj(path + filename)
"""
        corpus.add_doc(textacy.Doc(plain["content"], lang=corpus.spacy_lang, metadata=meta))

    return corpus, corpus.spacy_lang
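A hedged sketch of how the helpers added in this hunk are typically combined (label-plus-text export for the labeled topic-model step); corpus is assumed to be a loaded textacy corpus whose metadata contains categoryName, and the output filename is a placeholder.

# Illustrative only:
metaliste = get_list_from_config("tickets", "metaliste")      # e.g. ["TicketNumber", "Subject", ...]
savelabledCorpiLines(corpus, "corpi/labeled_lines.txt")       # writes "[categoryName] token token ..." per doc

for text, meta in zip(corpus2Text(corpus), corpus2Meta(corpus)):
    pass  # re-stream an existing corpus, e.g. into clean() / processDictstream()
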
@@ -1,466 +0,0 @@
# -*- coding: utf-8 -*-
import csv
import random
import sys

import spacy
import textacy

"""
import keras
import numpy as np
from keras.layers import Dense, SimpleRNN, LSTM, TimeDistributed, Dropout
from keras.models import Sequential
import keras.backend as K
"""
csv.field_size_limit(sys.maxsize)


"""
def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()
    # TODO word cleaning https://stackoverflow.com/questions/3939361/remove-specific-characters-from-a-string-in-python


    # iterate through the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        # iterate through the synonym block
        for syn in syn_block:
            syn = syn.lower().split(" ") if not re.match(r'\A[\w-]+\Z', syn) else syn  # turn the synonym into a list (to spot multi-word phrases)

            # if the word is contained in the synonym (i.e. equals a word in the list)
            if word in syn:

                # look for the Hauptform (base form)
                if "auptform" in syn:
                    # do not return it if it is in parentheses
                    for w in syn:
                        if not re.match(r'\([^)]+\)', w) and w is not None:
                            return w

                # if no Hauptform is contained, return the first synonym that is not a phrase and not in parentheses
                if len(syn) == 1:
                    w = syn[0]
                    if not re.match(r'\([^)]+\)', w) and w is not None:
                        return w

    return word  # as a fallback, return the input


"""
"""
def cleanText(string,custom_stopwords=None, custom_symbols=None, custom_words=None, customPreprocessing=None, lemmatize=False, normalize_synonyms=False):

    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)


    if custom_stopwords is not None:
        custom_stopwords = custom_stopwords
    else:
        custom_stopwords = []

    if custom_words is not None:
        custom_words = custom_words
    else:
        custom_words = []

    if custom_symbols is not None:
        custom_symbols = custom_symbols
    else:
        custom_symbols = []


    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS

    stoplist = list(stop_words) + custom_stopwords
    # List of symbols we don't care about either
    symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols


    # get rid of newlines
    string = string.strip().replace("\n", " ").replace("\r", " ")

    # replace twitter
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")


    # parse with spaCy
    spacy_doc = PARSER(string)
    tokens = []

    added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
    added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] # IDEA: include NUM in the corpus as well, but only nouns for topic modeling http://aclweb.org/anthology/U15-1013

    # append Tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())


    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove symbols
    tokens = [tok for tok in tokens if tok not in symbols]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    remove_large_strings_of_whitespace(" ".join(tokens))


    # idea: resolve abbreviations (especially TU -> Technische Universität)

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(tokens)


def remove_large_strings_of_whitespace(sentence):

    whitespaceFinder = re.compile(r'(\r\n|\r|\n)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)

    tokenlist = sentence.split(" ")

    while "" in tokenlist:
        tokenlist.remove("")
    while " " in tokenlist:
        tokenlist.remove(" ")

    return " ".join(tokenlist)
"""
"""
def generateFromXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()

    for ticket in root:
        metadata = {}
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleanText_words(field.text, PARSER, normalize_synonyms=normalize_Synonyms, lemmatize=lemmatize)
                else:
                    text = field.text
            else:
                # idea: clean here as well?
                metadata[field.tag] = field.text
        yield text, metadata
"""


LANGUAGE = 'de'
#PARSER = de_core_news_md.load()
PARSER = spacy.load(LANGUAGE)

from old.textCleaning import TextCleaner

cleaner = TextCleaner(parser=PARSER)


def generateTextfromTicketXML(path2xml, textfield='Beschreibung', clean=False, normalize_Synonyms=False, lemmatize=False):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
    root = tree.getroot()


    for ticket in root:
        text = "ERROR"
        for field in ticket:
            if field.tag == textfield:
                if clean:
                    text = cleaner.normalizeSynonyms(cleaner.removeWords(cleaner.keepPOSandENT(field.text)))  # ,normalize_synonyms=normalize_Synonyms,lemmatize=lemmatize)
                else:
                    text = field.text
        yield text


def generateMetadatafromTicketXML(path2xml, textfield='Beschreibung'):  # ,keys_to_clean=["Loesung","Zusammenfassung"]):
    import xml.etree.ElementTree as ET

    tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))

    root = tree.getroot()

    for ticket in root:
        metadata = {}
        for field in ticket:
            if field.tag != textfield:
                if field.tag == "Zusammenfassung":
                    metadata[field.tag] = cleaner.removePunctuation(field.text)
                elif field.tag == "Loesung":
                    metadata[field.tag] = cleaner.removeWhitespace(field.text)
                else:
                    metadata[field.tag] = field.text

        yield metadata


"""
def cleanText_symbols(string, parser=PARSER, custom_symbols=None, keep=None):

    if custom_symbols is not None:
        custom_symbols = custom_symbols
    else:
        custom_symbols = []

    if keep is not None:
        keep = keep
    else:
        keep = []

    # List of symbols we don't care about
    symbols = ["-----","---","...","“","”",".","-","<",">",",","?","!","..","n’t","n't","|","||",";",":","…","’s","'s",".","(",")","[","]","#"] + custom_symbols

    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    pos = ["NUM", "SPACE", "PUNCT"]
    for p in keep:
        pos.remove(p)


    # append Tokens to a list
    for tok in spacy_doc:
        if tok.pos_ not in pos and tok.text not in symbols:
            tokens.append(tok.text)

    return " ".join(tokens)


def cleanText_words(string, parser=PARSER, custom_stopwords=None, custom_words=None, customPreprocessing=cleanText_symbols, lemmatize=False, normalize_synonyms=False):

    # use preprocessing
    if customPreprocessing is not None:
        string = customPreprocessing(string)

    if custom_stopwords is not None:
        custom_stopwords = custom_stopwords
    else:
        custom_stopwords = []

    if custom_words is not None:
        custom_words = custom_words
    else:
        custom_words = []


    # custom stoplist
    # https://stackoverflow.com/questions/9806963/how-to-use-pythons-import-function-properly-import
    stop_words = __import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS

    stoplist = list(stop_words) + custom_stopwords

    # replace twitter
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    string = mentionFinder.sub("MENTION", string)

    # replace emails
    emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
    string = emailFinder.sub("EMAIL", string)

    # replace urls
    urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
    string = urlFinder.sub("URL", string)

    # replace HTML symbols
    string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")


    # parse with spaCy
    spacy_doc = parser(string)
    tokens = []

    added_entities = ["WORK_OF_ART","ORG","PRODUCT", "LOC"]#,"PERSON"]
    added_POS = ["NOUN"]#, "NUM" ]#,"VERB","ADJ"] # only nouns for topic modeling http://aclweb.org/anthology/U15-1013

    # append Tokens to a list
    for tok in spacy_doc:
        if tok.pos_ in added_POS:
            if lemmatize:
                tokens.append(tok.lemma_.lower().strip())
            else:
                tokens.append(tok.text.lower().strip())

        # add entities
        if tok.ent_type_ in added_entities:
            tokens.append(tok.text.lower())


    # remove stopwords
    tokens = [tok for tok in tokens if tok not in stoplist]

    # remove custom_words
    tokens = [tok for tok in tokens if tok not in custom_words]

    # remove single characters
    tokens = [tok for tok in tokens if len(tok) > 1]

    # remove large strings of whitespace
    #remove_whitespace(" ".join(tokens))


    # idea: resolve abbreviations (especially TU -> Technische Universität): abbreviation glossary

    if normalize_synonyms:
        tokens = [str(getFirstSynonym(tok, THESAURUS_list)) for tok in tokens]

    return " ".join(set(tokens))


def cleanText_removeWhitespace(sentence):
    whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
    sentence = whitespaceFinder.sub(" ", sentence)
    return sentence

# todo: preprocess pipe: removewhitespace, removePUNCT, resolveAbk, keepPOS, keepEnt, removeWords, normalizeSynonyms


def getFirstSynonym(word, thesaurus_gen):

    word = word.lower()

    # iterate through the thesaurus
    for syn_block in thesaurus_gen:  # syn_block is a list of synonyms

        for syn in syn_block:
            syn = syn.lower()
            if re.match(r'\A[\w-]+\Z', syn):  # if syn is a single word
                if word == syn:
                    return getHauptform(syn_block, word)
            else:  # if it is a phrase
                if word in syn:
                    return getHauptform(syn_block, word)
    return word  # as a fallback, return the original word


def getHauptform(syn_block, word, default_return_first_Syn=False):

    for syn in syn_block:
        syn = syn.lower()

        if "hauptform" in syn and len(syn.split(" ")) <= 2:
            # do not return it if it is in parentheses
            for w in syn.split(" "):
                if not re.match(r'\([^)]+\)', w):
                    return w

    if default_return_first_Syn:
        # if no Hauptform is contained, return the first synonym that is not a phrase and not in parentheses
        for w in syn_block:
            if not re.match(r'\([^)]+\)', w):
                return w
    return word  # as a fallback, return the original word
"""


def printRandomDoc(textacyCorpus):
    print()
    print("len(textacyCorpus) = %i" % len(textacyCorpus))
    randIndex = int((len(textacyCorpus) - 1) * random.random())
    print("Index: {0} ; Text: {1} ; Metadata: {2}".format(randIndex, textacyCorpus[randIndex].text, textacyCorpus[randIndex].metadata))
    print()


####################'####################'####################'####################'####################'##############
# todo config-file

DATAPATH = "ticketSamples.xml"
DATAPATH_thesaurus = "openthesaurus.csv"


normalize_Synonyms = True
clean = True
lemmatize = True

custom_words = ["grüßen", "fragen"]

####################'####################'####################'####################'####################'##############


## files to textacy corpus
textacyCorpus = textacy.Corpus(PARSER)

print("add texts to textacy corpus...")
textacyCorpus.add_texts(texts=generateTextfromTicketXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize), metadatas=generateMetadatafromTicketXML(DATAPATH))


#for txt, dic in generateFromXML(DATAPATH, normalize_Synonyms=normalize_Synonyms, clean=clean, lemmatize=lemmatize):
#    textacyCorpus.add_text(txt, dic)


for doc in textacyCorpus:
    print(doc.metadata)
    print(doc.text)

#print(textacyCorpus[2].text)
#printRandomDoc(textacyCorpus)
#print(textacyCorpus[len(textacyCorpus)-1].text)


print()
print()
213 old/test.py

@@ -1,213 +0,0 @@
# -*- coding: utf-8 -*-
import spacy
import textacy
from spacy.tokens import Doc

# -*- coding: utf-8 -*-
import re
import spacy
import functools

import textacy


class TextCleaner:

    def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
        """
        :param parser: spacy-parser
        :param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
        :param customClass_symbols: [str]
        :param customClass_words: [str]
        :param customClassPOS: [str]
        :param keep4All: [str]
        """
        if thesaurus is None:
            DATAPATH_thesaurus = "openthesaurus.csv"

            ## !!!!!! list() matters here: otherwise the same synonyms are not returned, because a generator is consumed during runtime
            self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
        else:
            self.thesaurus = thesaurus

        self.parser = parser

        #self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
        self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
        self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
        self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)

        # to keep
        self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]  # ,"PERSON"]
        self.pos2keep = ["NOUN"]  # , "NUM" ]#,"VERB","ADJ"] # only nouns for topic modeling http://aclweb.org/anthology/U15-1013

        """

        # to remove
        self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
                        ";", ":",
                        "…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
        self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS) + (customClass_words if customClass_words is not None else [])


        self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
        self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])


        keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep


        # modify those to remove with those to keep
        for sym in keep:
            try:
                self.symbols.remove(sym)
            except ValueError:
                pass
        for sym in keep:
            try:
                self.stop_words.remove(sym)
            except ValueError:
                pass
        """

    def loadString(self, string):
        self.currentDoc = self.parser(string)


    def removeWhitespace(self, string):
        return " ".join([tok.text for tok in self.currentDoc if not tok.is_space])


    def removePunctuation(self, string, custom_symbols=None, keep=None):
        symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
        if hasattr(keep, '__iter__'):
            for k in keep:
                try:
                    symbols.remove(k)
                except ValueError:
                    pass

        return " ".join([tok.text for tok in self.currentDoc if not tok.is_punct and tok.text not in symbols])


def cleanDoc(doc, toDelete=None, toKeep=None):
    """
    :param doc: spacyDoc
    :param toDelete: [str] pos_ , ent_type_ or tag_
    :return: str tokenlist
    """
    #keep
    tokenlist = []
    for tok in doc:
        if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep:
            tokenlist.append(tok.text)

    #delete
    tokenlist = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete]

    result = " ".join(tokenlist)
    return result  # problem: not a Doc and therefore not composable


def keepinDoc(doc, toKeep=None):
    """
    :param doc: spacyDoc
    :param toKeep: [str]
    :return: str tokenlist
    """
    return " ".join([tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep])


# https://mathieularose.com/function-composition-in-python/
parser = spacy.load('de')
cleaner = TextCleaner(parser)
corpus_raw = textacy.Corpus(parser)
corpus_clean = textacy.Corpus(parser)

def foo(doc, toKeep=None):

    words = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]
    spaces = [True] * len(words)

    return Doc(doc.vocab, words=words, spaces=spaces)

def foo2(doc, toDelete=None):  # , toKeep=None):
    """
    :param doc: spacyDoc
    :param toDelete: [str] pos_ , ent_type_ or tag_
    :return: str tokenlist
    """
    #keep
    #tokenlist = [tok.text for tok in doc if tok.pos_ in toKeep or tok.ent_type_ in toKeep or tok.tag_ in toKeep]

    #delete

    words = [tok.text for tok in doc if tok.pos_ in toDelete and not tok.ent_type_ in toDelete and not tok.tag_ in toDelete]
    spaces = [True] * len(words)

    return Doc(doc.vocab, words=words, spaces=spaces)


"""
def compose(self, *functions):
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)

def composeo(*functions):
    return functools.reduce(lambda f, g: lambda x: f(g(x)), functions)
"""

def double(a):
    return a * 2

def add(a, b):
    return a + b

def compose(*functions):
    def compose2(f, g):
        return lambda x: f(g(x))
    return functools.reduce(compose2, functions, lambda x: x)


#pipeline = compose(removeFromDoc, cleaner.removeWhitespace, cleaner.loadString)
"""
def pipe1(string):
    cleaner.loadString(string)
    string = cleaner.removeWhitespace(string)
    string = cleaner.removePunctuation(string)
    return string
"""

def cleaningPipe(spacy_pipe, composition):
    for doc in spacy_pipe:
        yield composition(doc)


pipeline = compose(
    functools.partial(foo2, toDelete=["PUNCT", "SPACE"]),
    functools.partial(foo, toKeep=["NOUN"]))


string = "Frau Hinrichs überdenkt die tu Situation und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"

doc = parser(string)

#print(removeFromDoc(doc,toDelete=["PUNCT"]))

print(pipeline(doc.text))


for txt in cleaningPipe(parser.pipe([string]), pipeline):
    print(txt)
"""
corpus_raw.add_text(string)
for doc in parser.pipe([string]):
    doc.text = removeFromDoc(doc, toDelete=["PUNCT"])
"""

#corpus_clean.add_texts(cleaningPipe(parser.pipe([string]),pipeline))
#print(corpus_raw[0].text)
199
old/testo.py
199
old/testo.py
|
@ -1,199 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import functools
|
||||
import re
|
||||
|
||||
import spacy
|
||||
import textacy
|
||||
from spacy.tokens import Doc
|
||||
from spacy.tagger import Tagger
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
PARSER = spacy.load('de')
|
||||
stop_words = list(__import__("spacy." + PARSER.lang, globals(), locals(), ['object']).STOP_WORDS)
|
||||
|
||||
def compose(*functions):
|
||||
def compose2(f, g):
|
||||
return lambda x: f(g(x))
|
||||
return functools.reduce(compose2, functions, lambda x: x)
|
||||
|
||||
|
||||
def cleanTexts(textstream, parser, attr):
|
||||
|
||||
#input str-stream output str-stream
|
||||
pipe = parser.pipe(textstream)
|
||||
|
||||
for doc in pipe:
|
||||
|
||||
tokens = [tok.text for tok in doc
|
||||
if tok.pos_ not in attr
|
||||
and tok.tag_ not in attr
|
||||
and tok.ent_ not in attr
|
||||
and tok.text not in attr
|
||||
and tok.lower_ not in attr]
|
||||
|
||||
|
||||
yield " ".join(tokens)
|
||||
|
||||
|
||||
"""
|
||||
def cleanDoc_lemmatize(doc,parser=PARSER):
|
||||
return parser(" ".join([tok.lemma_ for tok in doc ]))
|
||||
|
||||
|
||||
def cleanDoc_STOPS(doc,parser=PARSER, stop_words=None, keep=None):
|
||||
if stop_words is None:
|
||||
stop_words = list(__import__("spacy." + parser.lang, globals(), locals(), ['object']).STOP_WORDS)
|
||||
|
||||
if hasattr(keep, '__iter__'):
|
||||
for k in keep:
|
||||
try:
|
||||
stop_words.remove(k)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
return parser(" ".join([tok.text for tok in doc if tok.text not in stop_words]))
|
||||
|
||||
|
||||
|
||||
def cleanDoc_ENT(doc,parser=PARSER, keeponly=False, attr=["WORK_OF_ART", "ORG", "PRODUCT", "LOC"]):
|
||||
if keeponly:
|
||||
return parser(" ".join([tok.text for tok in doc if tok.ent_ in attr]))
|
||||
else:
|
||||
return parser(" ".join([tok.text for tok in doc if tok.ent_ not in attr]))
|
||||
|
||||
|
||||
|
||||
def cleanDoc_POS(doc,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
|
||||
if keeponly:
|
||||
return parser(" ".join([tok.text for tok in doc if tok.pos_ in attr]))
|
||||
else:
|
||||
return parser(" ".join([tok.text for tok in doc if tok.pos_ not in attr]))
|
||||
"""
|
||||
|
||||
|
||||
def cleanTexts_POS(spacypipe, keeponly=False, attr=["SPACE", "PUNCT"]):
|
||||
"""
|
||||
:param spacypipe: spaCy pipe (generator of Doc objects)
|
||||
:param keeponly: bool. If True, only tokens whose POS is in attr are kept; if False, those tokens are removed.
|
||||
:param attr: [str] pos_ or ent_type_
|
||||
:yields: stream of strings: full-length cleaned text
|
||||
"""
|
||||
if keeponly:
|
||||
for doc in spacypipe:
|
||||
yield " ".join([tok.text for tok in doc if tok.pos_ in attr])
|
||||
|
||||
else:
|
||||
for doc in spacypipe:
|
||||
yield " ".join([tok.text for tok in doc if tok.pos_ not in attr])
|
||||
|
||||
def cleanText_POS(text,parser=PARSER, keeponly=False, attr=["SPACE", "PUNCT"]):
|
||||
"""
|
||||
:param text: str
|
||||
:param keeponly: bool. If True, only tokens whose POS is in attr are kept; if False, those tokens are removed.
|
||||
:param attr: [str] pos_ or ent_type_
|
||||
:return: str
|
||||
"""
|
||||
doc = parser(text)
|
||||
|
||||
if keeponly:
|
||||
return " ".join([tok.text for tok in doc if tok.pos_ in attr])
|
||||
else:
|
||||
return " ".join([tok.text for tok in doc if tok.pos_ not in attr])
|
||||
|
||||
|
||||
def removeWhitespace(string):
|
||||
return re.sub(r'(\r\n|\r|\n|(\s)+)', ' ', string)
|
||||
|
||||
def removeWords(string, words):
|
||||
big_regex = re.compile('|'.join(map(re.escape, words)))
|
||||
return big_regex.sub("", string)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def generateMainTextfromTicketXML(path2xml, main_textfield='Beschreibung', cleaning_function=None):
|
||||
"""
|
||||
generates strings from XML
|
||||
:param path2xml:
|
||||
:param main_textfield:
|
||||
:param cleaning_function:
|
||||
:yields strings
|
||||
"""
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
|
||||
root = tree.getroot()
|
||||
|
||||
|
||||
for ticket in root:
|
||||
text = "ERROR"
|
||||
for field in ticket:
|
||||
if field.tag == main_textfield:
|
||||
if cleaning_function:
|
||||
text = cleaning_function(field.text)
|
||||
else:
|
||||
text = field.text
|
||||
yield text
|
||||
|
||||
def generateMetadatafromTicketXML(path2xml, key_function_pairs_to_clean, leave_out=['Beschreibung']):
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
|
||||
|
||||
root = tree.getroot()
|
||||
|
||||
for ticket in root:
|
||||
metadata = {}
|
||||
for field in ticket:
|
||||
if field.tag not in leave_out:
|
||||
|
||||
if field.tag in key_function_pairs_to_clean:
|
||||
metadata[field.tag] = key_function_pairs_to_clean[field.tag](field.text)
|
||||
else:
|
||||
metadata[field.tag] = field.text
|
||||
|
||||
yield metadata
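# A hedged usage sketch (not part of the original file): iterating metadata dicts from the
# sample XML and cleaning one field. The field names and the cleaning choice are assumptions
# taken from the surrounding code, not a prescribed call.
for meta in generateMetadatafromTicketXML("ticketSamples.xml",
                                           key_function_pairs_to_clean={"Loesung": removeWhitespace}):
    print(meta.get("Kategorie"), "->", meta.get("Loesung"))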
|
||||
|
||||
|
||||
|
||||
|
||||
string = "Frau Hinrichs überdenkt die tu Situation a@bc.de und 545453 macht ' dann neue Anträge. \n Dieses Ticket wird geschlossen \n \n test"
|
||||
|
||||
#print(removeWords(string,["die", "neue"]))
|
||||
|
||||
# in:str out:str
|
||||
cleanString = compose(
|
||||
cleanText_POS,
|
||||
functools.partial(textacy.preprocess.replace_emails, replace_with=u'EMAIL')
|
||||
)
|
||||
|
||||
key_function_pairs_to_clean = {
|
||||
"Loesung":removeWhitespace,
|
||||
"Zusammenfassung":cleanText_POS
|
||||
}
|
||||
"""
|
||||
# in:str-gen out:str-gen
|
||||
cleanStream = compose(
|
||||
removeSTOP,
|
||||
lemmatize,
|
||||
cleanEnt
|
||||
)
|
||||
"""
|
||||
# content: xml -> stringCleaning -> pipe -> docCleaning -> corpi
|
||||
# metadata:xml -> -> stringCleaning -> corpi
|
||||
|
||||
corpus = textacy.Corpus(PARSER)
|
||||
|
||||
|
||||
|
||||
|
||||
corpus.add_texts(
|
||||
cleanTexts(generateMainTextfromTicketXML("ticketSamples.xml"),PARSER,["PUNCT","SPACE","PERSON"])#,
|
||||
#generateMetadatafromTicketXML("ticketSamples.xml",key_function_pairs_to_clean=key_function_pairs_to_clean)
|
||||
)
|
||||
|
||||
print(corpus[0].text)
|
||||
|
|
@@ -1,263 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import re
|
||||
import spacy
|
||||
import functools
|
||||
|
||||
import textacy
|
||||
|
||||
|
||||
class TextCleaner:
|
||||
|
||||
def __init__(self, parser, thesaurus=None, customClass_symbols=None, customClass_words=None, keep4All=None):
|
||||
"""
|
||||
:param parser: spacy-parser
|
||||
:param thesaurus: [[syn1, syn2, ...],[syn1, syn2, ...], ...]
|
||||
:param customClass_symbols:[str]
|
||||
:param customClass_words:[str]
|
||||
:param customClassPOS:[str]
|
||||
:param keep4All: [str]
|
||||
"""
|
||||
if thesaurus is None:
|
||||
DATAPATH_thesaurus = "openthesaurus.csv"
|
||||
|
||||
## !!!!!! the list() is important: with a plain generator the same synonyms would not be returned later, because the generator gets consumed at runtime
|
||||
self.thesaurus = list(textacy.fileio.read_csv(DATAPATH_thesaurus, delimiter=";"))
|
||||
else:
|
||||
self.thesaurus = thesaurus
|
||||
|
||||
self.parser = parser
|
||||
|
||||
|
||||
|
||||
self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
|
||||
self.mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
|
||||
self.emailFinder = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
|
||||
self.urlFinder = re.compile(r"^(?:https?:\/\/)?(?:www\.)?[a-zA-Z0-9./]+$", re.IGNORECASE)
|
||||
|
||||
|
||||
|
||||
# to remove
|
||||
self.symbols = ["-----", "---", "...", "“", "”", ".", "-", "<", ">", ",", "?", "!", "..", "n’t", "n't", "|", "||",
|
||||
";", ":",
|
||||
"…", "’s", "'s", ".", "(", ")", "[", "]", "#"] + (customClass_symbols if customClass_symbols is not None else [])
|
||||
self.stop_words = list(__import__("spacy." + self.parser.lang, globals(), locals(), ['object']).STOP_WORDS)+ (customClass_words if customClass_words is not None else [])
|
||||
|
||||
|
||||
|
||||
# to keep
|
||||
self.entities2keep = ["WORK_OF_ART", "ORG", "PRODUCT", "LOC"] # ,"PERSON"]
|
||||
self.pos2keep = ["NOUN"] # , "NUM" ]#,"VERB","ADJ"] # for topic modeling keep only nouns, see http://aclweb.org/anthology/U15-1013
|
||||
|
||||
self.entities2keep = self.entities2keep + (keep4All if keep4All is not None else [])
|
||||
self.pos2keep = self.pos2keep + (keep4All if keep4All is not None else [])
|
||||
|
||||
|
||||
keep = (keep4All if hasattr(keep4All, '__iter__') else []) + self.pos2keep + self.entities2keep
|
||||
|
||||
|
||||
# modify those to remove with those to keep
|
||||
for sym in keep:
|
||||
try:
|
||||
self.symbols.remove(sym)
|
||||
except ValueError:
|
||||
pass
|
||||
for sym in keep:
|
||||
try:
|
||||
self.stop_words.remove(sym)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
# idea: keep self.currentDoc = spacy.Doc once per string instead of re-parsing in every method
|
||||
def loadString(self,string):
|
||||
self.currentDoc = self.parser(string)
|
||||
|
||||
"""
|
||||
def removeWhitespace(self, string):
|
||||
string = self.whitespaceFinder.sub(" ", string)
|
||||
return string
|
||||
"""
|
||||
def removeWhitespace(self, string):
|
||||
return string
|
||||
|
||||
#self.whitespaceFinder = re.compile(r'(\r\n|\r|\n|(\s)+)', re.IGNORECASE)
|
||||
|
||||
def removePunctuation(self, string, custom_symbols=None, keep=None):
|
||||
|
||||
|
||||
symbols = self.symbols + (custom_symbols if custom_symbols is not None else [])
|
||||
|
||||
if hasattr(keep, '__iter__'):
|
||||
for k in keep:
|
||||
try:
|
||||
symbols.remove(k)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
# parse with spaCy
|
||||
doc = self.parser(string)
|
||||
tokens = []
|
||||
|
||||
# append Tokens to a list
|
||||
for tok in doc:
|
||||
if not tok.is_punct and not tok.is_space and tok.text not in symbols:
|
||||
tokens.append(tok.text)
|
||||
|
||||
return " ".join(tokens)
|
||||
|
||||
def keepPOSandENT(self, string, customPOS=None, customEnt=None, remove=None):
|
||||
|
||||
pos2keep = self.pos2keep + (customPOS if customPOS is not None else [])
|
||||
ent = self.entities2keep + (customEnt if customEnt is not None else [])
|
||||
|
||||
if hasattr(remove, '__iter__'):
|
||||
for k in remove:
|
||||
try:
|
||||
ent.remove(k)
|
||||
except ValueError:
|
||||
try:
|
||||
pos2keep.remove(k)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# parse with spaCy
|
||||
spacy_doc = self.parser(string)
|
||||
tokens = []
|
||||
|
||||
# append Tokens to a list
|
||||
for tok in spacy_doc:
|
||||
|
||||
if tok.pos_ in pos2keep:
|
||||
tokens.append(tok.text)
|
||||
|
||||
if tok.ent_type_ in ent:
|
||||
tokens.append(tok.text)
|
||||
|
||||
return " ".join(set(tokens))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def resolveAbbreviations(self,string):
|
||||
return string #todo
|
||||
def removeWords(self,string, custom_words=None, keep=None, lemmatize=False):
|
||||
|
||||
wordlist = self.stop_words + (custom_words if custom_words is not None else [])
|
||||
if hasattr(keep, '__iter__'):
|
||||
for k in keep:
|
||||
try:
|
||||
wordlist.remove(k)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
|
||||
string = self.urlFinder.sub("URL", string)
|
||||
string = self.emailFinder.sub("EMAIL", string)
|
||||
string = self.mentionFinder.sub("MENTION", string)
|
||||
string = string.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
|
||||
|
||||
|
||||
# parse with spaCy
|
||||
spacy_doc = self.parser(string)
|
||||
tokens = []
|
||||
|
||||
# append Tokens to a list
|
||||
for tok in spacy_doc:
|
||||
|
||||
#do not include stopwords/customwords and single chars
|
||||
if tok.text not in wordlist and len(tok)>1:
|
||||
if lemmatize:
|
||||
tokens.append(tok.lemma_)
|
||||
else:
|
||||
tokens.append(tok.lower_)
|
||||
return " ".join(set(tokens))
|
||||
|
||||
|
||||
def normalizeSynonyms(self, string, default_return_first_Syn=False):
|
||||
# parse with spaCy
|
||||
spacy_doc = self.parser(string)
|
||||
tokens = []
|
||||
|
||||
tokens = [str(self.getFirstSynonym(tok, self.thesaurus, default_return_first_Syn=default_return_first_Syn)) for tok in spacy_doc]
|
||||
|
||||
return " ".join(set(tokens))
|
||||
|
||||
def getFirstSynonym(self,word, thesaurus, default_return_first_Syn=False):
|
||||
if not isinstance(word, str):
|
||||
return word
|
||||
|
||||
|
||||
word = word.lower()
|
||||
|
||||
|
||||
# iterate over the thesaurus
|
||||
for syn_block in thesaurus: # syn_block is a list of synonyms
|
||||
|
||||
for syn in syn_block:
|
||||
syn = syn.lower()
|
||||
if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
|
||||
if word == syn:
|
||||
return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
|
||||
else: # otherwise syn is a phrase
|
||||
if word in syn:
|
||||
return self.getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn)
|
||||
return word # as a fallback, return the original word
|
||||
|
||||
def getHauptform(self,syn_block, word, default_return_first_Syn=False):
|
||||
|
||||
for syn in syn_block:
|
||||
syn = syn.lower()
|
||||
|
||||
if "hauptform" in syn and len(syn.split(" ")) <= 2:
|
||||
# do not return it if it is in parentheses
|
||||
for w in syn.split(" "):
|
||||
if not re.match(r'\([^)]+\)', w):
|
||||
return w
|
||||
|
||||
if default_return_first_Syn:
|
||||
# if no Hauptform (main form) is contained, return the first synonym that is not a phrase and not in parentheses
|
||||
for w in syn_block:
|
||||
if not re.match(r'\([^)]+\)', w):
|
||||
return w
|
||||
return word # as a fallback, return the original word
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
#################################################################################################################
|
||||
|
||||
#todo somehow does not work as it should: https://mathieularose.com/function-composition-in-python/
|
||||
def compose(self,*functions):
|
||||
return functools.reduce(lambda f, g: lambda x: f(g(x)), functions, lambda x: x)
|
||||
|
||||
pipeline = compose(functools.partial(cleaner.keepPOSandENT,lemmatize=True))#, cleaner.normalizeSynonyms)
|
||||
|
||||
#################################################################################################################
|
||||
"""
|
||||
openthesaurus.csv (37174 lines): file diff suppressed because it is too large
preprocessing.py (152 lines changed)
@@ -24,16 +24,7 @@ with open(config_ini) as f:
|
|||
config.read_file(f)
|
||||
|
||||
|
||||
global REGEX_SPECIALCHAR
|
||||
global REGEX_TOPLVL
|
||||
|
||||
global THESAURUS
|
||||
global WORDS
|
||||
global LEMMAS
|
||||
global NOUNS
|
||||
global VORNAMEN
|
||||
global DE_STOP_WORDS
|
||||
global EN_STOP_WORDS
|
||||
|
||||
REGEX_SPECIALCHAR = r'[`\-=~%^&*()_+\[\]{};\'\\:"|</>]' #+r',.'
|
||||
REGEX_TOPLVL = r'\.[a-z]{2,3}(\.[a-z]{2,3})?'
|
||||
|
@@ -47,8 +38,20 @@ VORNAMEN= {}
|
|||
DE_STOP_WORDS= {}
|
||||
EN_STOP_WORDS= {}
|
||||
|
||||
|
||||
|
||||
############# filter tokens
|
||||
|
||||
|
||||
def filterTokens(tokens, funclist):
|
||||
# in:tokenlist, funclist
|
||||
# out: tokenlist
|
||||
for f in funclist:
|
||||
tokens = list(filter(f, tokens))
|
||||
|
||||
return tokens
|
||||
|
||||
|
||||
def keepPOS(pos_list):
|
||||
return lambda tok: tok.pos_ in pos_list
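# A hedged sketch (not part of the original file) of how keepPOS combines with filterTokens:
# each factory returns a predicate over spaCy tokens and filterTokens applies the predicates
# in sequence. The sample sentence and the installed "de" model are assumptions.
import spacy

def filterTokens(tokens, funclist):  # repeated from above so the sketch is self-contained
    for f in funclist:
        tokens = list(filter(f, tokens))
    return tokens

def keepPOS(pos_list):
    return lambda tok: tok.pos_ in pos_list

nlp = spacy.load("de")                               # assumes the German model is available
doc = nlp("Frau Hinrichs stellt neue Antraege")
nouns = filterTokens(list(doc), [keepPOS(["NOUN"])])
print([tok.text for tok in nouns])                   # nouns only; exact output depends on the tagger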
|
||||
|
||||
|
@@ -107,7 +110,7 @@ def remove_first_names():
|
|||
############# strings
|
||||
|
||||
def remove_addresses(string):
|
||||
pass # todo
|
||||
pass # todo remove_addresses idee postal.parser und zu metadaten hinzufügen
|
||||
|
||||
def lemmatizeWord(word,lemma_dict=LEMMAS,n=3):
|
||||
for i in range(n):
|
||||
|
@@ -183,55 +186,9 @@ def autocorrectWord(word):
|
|||
|
||||
|
||||
############# stringcleaning
|
||||
@deprecated
|
||||
def stringcleaning(stringstream):
|
||||
|
||||
|
||||
for string in stringstream:
|
||||
string = string.lower()
|
||||
|
||||
# fixUnicode
|
||||
string = textacy.preprocess.fix_bad_unicode(string.lower(), normalization=u'NFC')
|
||||
|
||||
# remove_words_containing_topLVL
|
||||
string = " ".join([w.lower() for w in string.split() if not re.search(REGEX_TOPLVL, w)])
|
||||
|
||||
# replaceRockDots
|
||||
string = re.sub(r'[ß]', "ss", string)
|
||||
string = re.sub(r'[ö]', "oe", string)
|
||||
string = re.sub(r'[ü]', "ue", string)
|
||||
string = re.sub(r'[ä]', "ae", string)
|
||||
|
||||
# seperate_words_on_regex:
|
||||
string = " ".join(re.compile(REGEX_SPECIALCHAR).split(string))
|
||||
|
||||
# cut_after
|
||||
word = "gruss" # idea: remove addresses --> postal.parser
|
||||
string = string.rpartition(word)[0] if word in string else string
|
||||
|
||||
# lemmatize
|
||||
string = " ".join([lemmatizeWord(word) for word in string.split()])
|
||||
|
||||
# normalize synonyms  # idea: before or after lemmatize?
|
||||
string = " ".join([getFirstSynonym(word) for word in string.split()])
|
||||
|
||||
# autocorrect
|
||||
string = " ".join([autocorrectWord(word) for word in string.split()])
|
||||
|
||||
yield string
|
||||
|
||||
|
||||
|
||||
|
||||
def filterTokens(tokens, funclist):
|
||||
# in:tokenlist, funclist
|
||||
# out: tokenlist
|
||||
for f in funclist:
|
||||
tokens = list(filter(f, tokens))
|
||||
|
||||
return tokens
|
||||
|
||||
def processContentstream2(textstream, parser, token_filterlist=None):
|
||||
def processContentstream(textstream, parser, token_filterlist=None):
|
||||
|
||||
#pre parse
|
||||
textstream = preparse(textstream)
|
||||
|
@@ -247,7 +204,7 @@ def processContentstream2(textstream, parser, token_filterlist=None):
|
|||
tokens = filterTokens(tokens, token_filterlist)
|
||||
|
||||
# post parse
|
||||
tokens = [postparse(tok) for tok in tokens] #todo loss of information: pos, tag etc.!
|
||||
tokens = [postparse(tok) for tok in tokens] #todo: loss of information: pos, tag etc.!
|
||||
|
||||
yield " ".join(tokens)
|
||||
|
||||
|
@@ -256,7 +213,6 @@ def preparse(stringstream):
|
|||
for string in stringstream:
|
||||
|
||||
# cut_after
|
||||
# todo: remove addresses --> postal.parser; idea: add them to the metadata
|
||||
words = ["gruss", "grusse","gruesse","gruessen","grusses"]
|
||||
|
||||
for gr in words:
|
||||
|
@@ -287,39 +243,6 @@ def postparse(toktext):
|
|||
|
||||
return toktext
|
||||
|
||||
def corpus2Text(corpus):
|
||||
for doc in corpus:
|
||||
yield doc.text
|
||||
|
||||
def corpus2Meta(corpus):
|
||||
for doc in corpus:
|
||||
yield doc.metadata
|
||||
|
||||
@deprecated
|
||||
def processContentstream(textstream, parser, token_filterlist=None):
|
||||
"""
|
||||
:param textstream: string-gen
|
||||
:param funclist: [func]
|
||||
:param parser: spacy-parser
|
||||
:return: string-gen
|
||||
"""
|
||||
|
||||
# pre_parse
|
||||
textstream = stringcleaning(textstream)
|
||||
|
||||
pipe = parser.pipe(textstream)
|
||||
|
||||
tokens = []
|
||||
for doc in pipe:
|
||||
|
||||
tokens = [tok for tok in doc]
|
||||
|
||||
# in_parse
|
||||
if token_filterlist is not None:
|
||||
tokens = filterTokens(tokens, token_filterlist)
|
||||
|
||||
yield " ".join([tok.lower_ for tok in tokens])
|
||||
# yield " ".join(list(set([tok.lower_ for tok in tokens])))
|
||||
|
||||
def processDictstream(dictstream, funcdict, parser):
|
||||
"""
|
||||
|
@@ -356,30 +279,30 @@ def processDictstream(dictstream, funcdict, parser):
|
|||
|
||||
##################################################################################################
|
||||
|
||||
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/preprocessing.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_preprocessing.log &"
|
||||
|
||||
ressources_path = FILEPATH + "ressources/"
|
||||
|
||||
path2thesaurus_dict = ressources_path + config.get("thesaurus","pickle_file")
|
||||
path2wordsdict = ressources_path + config.get("spellchecking", "pickle_file")
|
||||
path2lemmadict = ressources_path + config.get("lemmatization","pickle_file")
|
||||
path2nouns_list = ressources_path + config.get("nouns","pickle_file")
|
||||
path2firstnameslist = ressources_path + config.get("firstnames","pickle_file")
|
||||
|
||||
|
||||
path2thesaurus_dict = FILEPATH + config.get("thesaurus","pickle_file")
|
||||
path2DEstopwordlist = ressources_path + config.get("de_stopwords", "pickle_file")
|
||||
path2ENstopwordlist = ressources_path + config.get("en_stopwords", "pickle_file")
|
||||
|
||||
path2wordsdict = FILEPATH + config.get("spellchecking", "pickle_file")
|
||||
custom_words = get_list_from_config("preprocessing", "custom_words")
|
||||
|
||||
path2lemmadict = FILEPATH + config.get("lemmatization","pickle_file")
|
||||
|
||||
path2nouns_list = FILEPATH + config.get("nouns","pickle_file")
|
||||
|
||||
path2firstnameslist = FILEPATH + config.get("firstnames","pickle_file")
|
||||
|
||||
path2DEstopwordlist = FILEPATH + config.get("de_stopwords", "pickle_file")
|
||||
|
||||
path2ENstopwordlist = FILEPATH + config.get("en_stopwords", "pickle_file")
|
||||
|
||||
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||
de_plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"
|
||||
|
||||
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
||||
|
||||
|
||||
|
||||
|
||||
def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", printrandom=10):
|
||||
|
||||
logprint("Preprocess {0}_corpus at {1}".format(lang, datetime.now()))
|
||||
|
@@ -387,8 +310,8 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
|
|||
cleanCorpus_name = lang + "_clean_ticket"
|
||||
preCorpus_name = lang + "_pre_ticket"
|
||||
|
||||
logprint("Load {0}_raw".format(lang))
|
||||
#load raw corpus and create new one
|
||||
logprint("Load {0}_raw".format(lang))
|
||||
clean_corpus, parser = load_corpus(corpus_name=cleanCorpus_name, corpus_path=corpus_path)
|
||||
|
||||
corpus = textacy.Corpus(parser)
|
||||
|
@@ -396,7 +319,7 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
|
|||
|
||||
## process and add files to textacy-corpi,
|
||||
corpus.add_texts(
|
||||
processContentstream2(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
|
||||
processContentstream(corpus2Text(clean_corpus), token_filterlist=filter_tokens, parser=parser),
|
||||
processDictstream(corpus2Meta(clean_corpus), clean_in_meta,parser=parser)
|
||||
)
|
||||
|
||||
|
@@ -409,22 +332,16 @@ def preprocessCorpus(corpus_path, filter_tokens, clean_in_meta, lang="de", print
|
|||
printRandomDoc(corpus)
|
||||
|
||||
|
||||
|
||||
#save corpus
|
||||
save_corpus(corpus=corpus, corpus_path=corpus_path, corpus_name=preCorpus_name)
|
||||
|
||||
|
||||
#save corpus as labled, plain text
|
||||
plainpath = FILEPATH + config.get("de_corpus", "path") + "pre_labled_lines.txt"
|
||||
textacy.fileio.write_file_lines(labledCorpiLines(corpus),filepath=plainpath )
|
||||
savelabledCorpiLines(corpus, de_plainpath)
|
||||
|
||||
|
||||
return corpus
|
||||
|
||||
def labledCorpiLines(corpus):
|
||||
for doc in corpus:
|
||||
# generate "[topic1, topic2, ...] tok1 tok2 tok3" lines out of the corpus
|
||||
yield "[" + doc.metadata["categoryName"] + "] " + doc.text
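# For reference (illustrative values, not from the original data): a doc whose metadata
# contains categoryName "lan" and whose text is "switch port defekt" would be emitted by
# labledCorpiLines as the single labeled line:
#
#   [lan] switch port defekt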
|
||||
|
||||
|
||||
def main():
|
||||
start = time.time()
|
||||
|
@@ -438,10 +355,8 @@ def main():
|
|||
NOUNS = load_obj(path2nouns_list)
|
||||
VORNAMEN = load_obj(path2firstnameslist)
|
||||
|
||||
custom_words = config.get("preprocessing","custom_words").split(",")
|
||||
|
||||
filter_tokens = [
|
||||
# removeENT(["PERSON"]),
|
||||
|
||||
keepNouns(NOUNS),
|
||||
|
||||
|
@@ -465,13 +380,10 @@ def main():
|
|||
"categoryName": [removePOS(["SPACE", "PUNCT"])]
|
||||
}
|
||||
|
||||
|
||||
corpus = preprocessCorpus(corpus_de_path, filter_tokens, clean_in_meta, "de",printrandom=5)
|
||||
|
||||
#from topicModeling import jgibbsLLDA
|
||||
|
||||
#jgibbsLLDA(corpus)
|
||||
|
||||
#preprocessCorpus(corpus_en_path, filter_tokens, clean_in_meta, "en" )
|
||||
|
||||
end = time.time()
|
||||
logprint("Time Elapsed Preprocessing:{0} min".format((end - start) / 60))
|
||||
|
|
spell.py (58 lines deleted)
@@ -1,58 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
# https://github.com/norvig/pytudes/blob/master/spell.py
|
||||
|
||||
"""Spelling Corrector in Python 3; see http://norvig.com/spell-correct.html
|
||||
|
||||
Copyright (c) 2007-2016 Peter Norvig
|
||||
MIT license: www.opensource.org/licenses/mit-license.php
|
||||
"""
|
||||
|
||||
################ Spelling Corrector
|
||||
|
||||
import re
|
||||
from collections import Counter
|
||||
import spacy
|
||||
import textacy
|
||||
def words(text): return re.findall(r'\w+', text.lower())
|
||||
|
||||
|
||||
|
||||
|
||||
WORDS = Counter(words(open('bigo.txt').read()))
|
||||
x=0
|
||||
def P(word, N=sum(WORDS.values())):
|
||||
"Probability of `word`."
|
||||
return WORDS[word] / N
|
||||
|
||||
|
||||
def correction(word):
|
||||
"Most probable spelling correction for word."
|
||||
return max(candidates(word), key=P)
|
||||
|
||||
|
||||
def candidates(word):
|
||||
"Generate possible spelling corrections for word."
|
||||
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
|
||||
|
||||
|
||||
def known(words):
|
||||
"The subset of `words` that appear in the dictionary of WORDS."
|
||||
return set(w for w in words if w in WORDS)
|
||||
|
||||
|
||||
def edits1(word):
|
||||
"All edits that are one edit away from `word`."
|
||||
letters = 'abcdefghijklmnopqrstuvwxyz'
|
||||
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
|
||||
deletes = [L + R[1:] for L, R in splits if R]
|
||||
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
|
||||
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
|
||||
inserts = [L + c + R for L, R in splits for c in letters]
|
||||
return set(deletes + transposes + replaces + inserts)
|
||||
|
||||
|
||||
def edits2(word):
|
||||
"All edits that are two edits away from `word`."
|
||||
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
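# A hedged usage sketch (not part of spell.py): exercising the corrector defined above once
# WORDS has been built from the frequency file. The example words and the expected outputs
# are illustrative and depend entirely on the contents of that corpus.
print(correction("huose"))    # likely "house", if "house" occurs in the frequency corpus
print(candidates("speling"))  # known words within edit distance 1 or 2, e.g. {"spelling", ...}
print(P("the"))               # relative unigram frequency of "the" in the corpus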
|
||||
|
stopwords-de.txt (622 lines deleted)
@@ -1,622 +0,0 @@
|
|||
a
|
||||
ab
|
||||
aber
|
||||
ach
|
||||
acht
|
||||
achte
|
||||
trotz
|
||||
achten
|
||||
achter
|
||||
achtes
|
||||
ag
|
||||
alle
|
||||
allein
|
||||
allem
|
||||
allen
|
||||
aller
|
||||
allerdings
|
||||
alles
|
||||
allgemeinen
|
||||
als
|
||||
also
|
||||
am
|
||||
an
|
||||
ander
|
||||
andere
|
||||
anderem
|
||||
anderen
|
||||
anderer
|
||||
anderes
|
||||
anderm
|
||||
andern
|
||||
anderr
|
||||
anders
|
||||
au
|
||||
auch
|
||||
auf
|
||||
aus
|
||||
ausser
|
||||
ausserdem
|
||||
außer
|
||||
außerdem
|
||||
b
|
||||
bald
|
||||
bei
|
||||
beide
|
||||
beiden
|
||||
beim
|
||||
beispiel
|
||||
bekannt
|
||||
bereits
|
||||
besonders
|
||||
besser
|
||||
besten
|
||||
bin
|
||||
bis
|
||||
bisher
|
||||
bist
|
||||
c
|
||||
d
|
||||
d.h
|
||||
da
|
||||
dabei
|
||||
dadurch
|
||||
dafür
|
||||
dagegen
|
||||
daher
|
||||
dahin
|
||||
dahinter
|
||||
damals
|
||||
damit
|
||||
danach
|
||||
daneben
|
||||
dank
|
||||
dann
|
||||
daran
|
||||
darauf
|
||||
daraus
|
||||
darf
|
||||
darfst
|
||||
darin
|
||||
darum
|
||||
darunter
|
||||
darüber
|
||||
das
|
||||
dasein
|
||||
daselbst
|
||||
dass
|
||||
dasselbe
|
||||
davon
|
||||
davor
|
||||
dazu
|
||||
dazwischen
|
||||
daß
|
||||
dein
|
||||
deine
|
||||
deinem
|
||||
deinen
|
||||
deiner
|
||||
deines
|
||||
dem
|
||||
dementsprechend
|
||||
demgegenüber
|
||||
demgemäss
|
||||
demgemäß
|
||||
demselben
|
||||
demzufolge
|
||||
den
|
||||
denen
|
||||
denn
|
||||
denselben
|
||||
der
|
||||
deren
|
||||
derer
|
||||
derjenige
|
||||
derjenigen
|
||||
dermassen
|
||||
dermaßen
|
||||
derselbe
|
||||
derselben
|
||||
des
|
||||
deshalb
|
||||
desselben
|
||||
dessen
|
||||
deswegen
|
||||
dich
|
||||
die
|
||||
diejenige
|
||||
diejenigen
|
||||
dies
|
||||
diese
|
||||
dieselbe
|
||||
dieselben
|
||||
diesem
|
||||
diesen
|
||||
dieser
|
||||
dieses
|
||||
dir
|
||||
doch
|
||||
dort
|
||||
drei
|
||||
drin
|
||||
dritte
|
||||
dritten
|
||||
dritter
|
||||
drittes
|
||||
du
|
||||
durch
|
||||
durchaus
|
||||
durfte
|
||||
durften
|
||||
dürfen
|
||||
dürft
|
||||
e
|
||||
eben
|
||||
ebenso
|
||||
ehrlich
|
||||
ei
|
||||
ei,
|
||||
eigen
|
||||
eigene
|
||||
eigenen
|
||||
eigener
|
||||
eigenes
|
||||
ein
|
||||
einander
|
||||
eine
|
||||
einem
|
||||
einen
|
||||
einer
|
||||
eines
|
||||
einig
|
||||
einige
|
||||
einigem
|
||||
einigen
|
||||
einiger
|
||||
einiges
|
||||
einmal
|
||||
eins
|
||||
elf
|
||||
en
|
||||
ende
|
||||
endlich
|
||||
entweder
|
||||
er
|
||||
ernst
|
||||
erst
|
||||
erste
|
||||
ersten
|
||||
erster
|
||||
erstes
|
||||
es
|
||||
etwa
|
||||
etwas
|
||||
euch
|
||||
euer
|
||||
eure
|
||||
eurem
|
||||
euren
|
||||
eurer
|
||||
eures
|
||||
f
|
||||
folgende
|
||||
früher
|
||||
fünf
|
||||
fünfte
|
||||
fünften
|
||||
fünfter
|
||||
fünftes
|
||||
für
|
||||
g
|
||||
gab
|
||||
ganz
|
||||
ganze
|
||||
ganzen
|
||||
ganzer
|
||||
ganzes
|
||||
gar
|
||||
gedurft
|
||||
gegen
|
||||
gegenüber
|
||||
gehabt
|
||||
gehen
|
||||
geht
|
||||
gekannt
|
||||
gekonnt
|
||||
gemacht
|
||||
gemocht
|
||||
gemusst
|
||||
genug
|
||||
gerade
|
||||
gern
|
||||
gesagt
|
||||
geschweige
|
||||
gewesen
|
||||
gewollt
|
||||
geworden
|
||||
gibt
|
||||
ging
|
||||
gleich
|
||||
gott
|
||||
gross
|
||||
grosse
|
||||
grossen
|
||||
grosser
|
||||
grosses
|
||||
groß
|
||||
große
|
||||
großen
|
||||
großer
|
||||
großes
|
||||
gut
|
||||
gute
|
||||
guter
|
||||
gutes
|
||||
h
|
||||
hab
|
||||
habe
|
||||
haben
|
||||
habt
|
||||
hast
|
||||
hat
|
||||
hatte
|
||||
hatten
|
||||
hattest
|
||||
hattet
|
||||
heisst
|
||||
her
|
||||
heute
|
||||
hier
|
||||
hin
|
||||
hinter
|
||||
hoch
|
||||
hätte
|
||||
hätten
|
||||
i
|
||||
ich
|
||||
ihm
|
||||
ihn
|
||||
ihnen
|
||||
ihr
|
||||
ihre
|
||||
ihrem
|
||||
ihren
|
||||
ihrer
|
||||
ihres
|
||||
im
|
||||
immer
|
||||
in
|
||||
indem
|
||||
infolgedessen
|
||||
ins
|
||||
irgend
|
||||
ist
|
||||
j
|
||||
ja
|
||||
jahr
|
||||
jahre
|
||||
jahren
|
||||
je
|
||||
jede
|
||||
jedem
|
||||
jeden
|
||||
jeder
|
||||
jedermann
|
||||
jedermanns
|
||||
jedes
|
||||
jedoch
|
||||
jemand
|
||||
jemandem
|
||||
jemanden
|
||||
jene
|
||||
jenem
|
||||
jenen
|
||||
jener
|
||||
jenes
|
||||
jetzt
|
||||
k
|
||||
kam
|
||||
kann
|
||||
kannst
|
||||
kaum
|
||||
kein
|
||||
keine
|
||||
keinem
|
||||
keinen
|
||||
keiner
|
||||
keines
|
||||
kleine
|
||||
kleinen
|
||||
kleiner
|
||||
kleines
|
||||
kommen
|
||||
kommt
|
||||
konnte
|
||||
konnten
|
||||
kurz
|
||||
können
|
||||
könnt
|
||||
könnte
|
||||
l
|
||||
lang
|
||||
lange
|
||||
leicht
|
||||
leide
|
||||
lieber
|
||||
los
|
||||
m
|
||||
machen
|
||||
macht
|
||||
machte
|
||||
mag
|
||||
magst
|
||||
mahn
|
||||
mal
|
||||
man
|
||||
manche
|
||||
manchem
|
||||
manchen
|
||||
mancher
|
||||
manches
|
||||
mann
|
||||
mehr
|
||||
mein
|
||||
meine
|
||||
meinem
|
||||
meinen
|
||||
meiner
|
||||
meines
|
||||
mensch
|
||||
menschen
|
||||
mich
|
||||
mir
|
||||
mit
|
||||
mittel
|
||||
mochte
|
||||
mochten
|
||||
morgen
|
||||
muss
|
||||
musst
|
||||
musste
|
||||
mussten
|
||||
muß
|
||||
mußt
|
||||
möchte
|
||||
mögen
|
||||
möglich
|
||||
mögt
|
||||
müssen
|
||||
müsst
|
||||
müßt
|
||||
n
|
||||
na
|
||||
nach
|
||||
nachdem
|
||||
nahm
|
||||
natürlich
|
||||
neben
|
||||
nein
|
||||
neue
|
||||
neuen
|
||||
neun
|
||||
neunte
|
||||
neunten
|
||||
neunter
|
||||
neuntes
|
||||
nicht
|
||||
nichts
|
||||
nie
|
||||
niemand
|
||||
niemandem
|
||||
niemanden
|
||||
noch
|
||||
nun
|
||||
nur
|
||||
o
|
||||
ob
|
||||
oben
|
||||
oder
|
||||
offen
|
||||
oft
|
||||
ohne
|
||||
ordnung
|
||||
p
|
||||
q
|
||||
r
|
||||
recht
|
||||
rechte
|
||||
rechten
|
||||
rechter
|
||||
rechtes
|
||||
richtig
|
||||
rund
|
||||
s
|
||||
sa
|
||||
sache
|
||||
sagt
|
||||
sagte
|
||||
sah
|
||||
satt
|
||||
schlecht
|
||||
schluss
|
||||
schon
|
||||
sechs
|
||||
sechste
|
||||
sechsten
|
||||
sechster
|
||||
sechstes
|
||||
sehr
|
||||
sei
|
||||
seid
|
||||
seien
|
||||
sein
|
||||
seine
|
||||
seinem
|
||||
seinen
|
||||
seiner
|
||||
seines
|
||||
seit
|
||||
seitdem
|
||||
selbst
|
||||
sich
|
||||
sie
|
||||
sieben
|
||||
siebente
|
||||
siebenten
|
||||
siebenter
|
||||
siebentes
|
||||
sind
|
||||
so
|
||||
solang
|
||||
solche
|
||||
solchem
|
||||
solchen
|
||||
solcher
|
||||
solches
|
||||
soll
|
||||
sollen
|
||||
sollst
|
||||
sollt
|
||||
sollte
|
||||
sollten
|
||||
sondern
|
||||
sonst
|
||||
soweit
|
||||
sowie
|
||||
später
|
||||
startseite
|
||||
statt
|
||||
steht
|
||||
suche
|
||||
t
|
||||
tag
|
||||
tage
|
||||
tagen
|
||||
tat
|
||||
teil
|
||||
tel
|
||||
tritt
|
||||
trotzdem
|
||||
tun
|
||||
u
|
||||
uhr
|
||||
um
|
||||
und
|
||||
und?
|
||||
uns
|
||||
unse
|
||||
unsem
|
||||
unsen
|
||||
unser
|
||||
unsere
|
||||
unserer
|
||||
unses
|
||||
unter
|
||||
v
|
||||
vergangenen
|
||||
viel
|
||||
viele
|
||||
vielem
|
||||
vielen
|
||||
vielleicht
|
||||
vier
|
||||
vierte
|
||||
vierten
|
||||
vierter
|
||||
viertes
|
||||
vom
|
||||
von
|
||||
vor
|
||||
w
|
||||
wahr?
|
||||
wann
|
||||
war
|
||||
waren
|
||||
warst
|
||||
wart
|
||||
warum
|
||||
was
|
||||
weg
|
||||
wegen
|
||||
weil
|
||||
weit
|
||||
weiter
|
||||
weitere
|
||||
weiteren
|
||||
weiteres
|
||||
welche
|
||||
welchem
|
||||
welchen
|
||||
welcher
|
||||
welches
|
||||
wem
|
||||
wen
|
||||
wenig
|
||||
wenige
|
||||
weniger
|
||||
weniges
|
||||
wenigstens
|
||||
wenn
|
||||
wer
|
||||
werde
|
||||
werden
|
||||
werdet
|
||||
weshalb
|
||||
wessen
|
||||
wie
|
||||
wieder
|
||||
wieso
|
||||
will
|
||||
willst
|
||||
wir
|
||||
wird
|
||||
wirklich
|
||||
wirst
|
||||
wissen
|
||||
wo
|
||||
woher
|
||||
wohin
|
||||
wohl
|
||||
wollen
|
||||
wollt
|
||||
wollte
|
||||
wollten
|
||||
worden
|
||||
wurde
|
||||
wurden
|
||||
während
|
||||
währenddem
|
||||
währenddessen
|
||||
wäre
|
||||
würde
|
||||
würden
|
||||
x
|
||||
y
|
||||
z
|
||||
z.b
|
||||
zehn
|
||||
zehnte
|
||||
zehnten
|
||||
zehnter
|
||||
zehntes
|
||||
zeit
|
||||
zu
|
||||
zuerst
|
||||
zugleich
|
||||
zum
|
||||
zunächst
|
||||
zur
|
||||
zurück
|
||||
zusammen
|
||||
zwanzig
|
||||
zwar
|
||||
zwei
|
||||
zweite
|
||||
zweiten
|
||||
zweiter
|
||||
zweites
|
||||
zwischen
|
||||
zwölf
|
||||
über
|
||||
überhaupt
|
||||
übrigens
|
synsets.xml (95963 lines): file diff suppressed because it is too large
testra.py (610 lines deleted)
@@ -1,610 +0,0 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
|
||||
#import spacy
|
||||
#import textacy
|
||||
from functools import reduce
|
||||
|
||||
import textacy
|
||||
|
||||
start = time.time()
|
||||
|
||||
import enchant
|
||||
|
||||
from datetime import datetime
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
||||
from miscellaneous import *
|
||||
|
||||
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
|
||||
|
||||
|
||||
# load config
|
||||
config_ini = FILEPATH + "config.ini"
|
||||
|
||||
config = ConfigParser.ConfigParser()
|
||||
with open(config_ini) as f:
|
||||
config.read_file(f)
|
||||
|
||||
|
||||
PARSER=spacy.load("de")
|
||||
|
||||
|
||||
corpi = textacy.Corpus(PARSER)
|
||||
|
||||
testcontetn = [
|
||||
"fdsfdsfsd",
|
||||
"juzdtjlkö",
|
||||
"gfadojplk"
|
||||
]
|
||||
|
||||
testmetda = [
|
||||
{"categoryName":"zhb","Solution":"","Subject":"schulungstest"},
|
||||
{"categoryName":"neuanschluss","Solution":"subject","Subject":"telephone contract"},
|
||||
{"categoryName":"zhb","Solution":"","Subject":"setuji"}
|
||||
]
|
||||
|
||||
|
||||
def makecontent(testcontetn):
|
||||
for content in testcontetn:
|
||||
yield content
|
||||
|
||||
|
||||
def makemeta( testmetda):
|
||||
for metdata in testmetda:
|
||||
yield metdata
|
||||
|
||||
|
||||
def corpus2Text(corpus):
|
||||
for doc in corpus:
|
||||
yield doc.text
|
||||
|
||||
corpi.add_texts(
|
||||
makecontent(testcontetn),
|
||||
makemeta(testmetda)
|
||||
)
|
||||
|
||||
|
||||
save_corpus(corpi,corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/test",corpus_name="test")
|
||||
|
||||
bla = "uni mail account adresse woche falsch laufen schicken gerne januar betreff herr nachricht gruesse dezernat liebe datum freitag anfrage dienstag unicard karte abholen defekt bibliothek abholung dezember beantragung status gerne portal email nummer service id vorname prozess dez schauen eg rechner mitarbeiterin benutzerkonto oktober wissenschaftliche projekt fr download hilfskraft verantwortliche link dringend antrag schnelle arbeitsplatz november admin rahmen stand geschickt server outlook ordner bild konto postfach campus hi ueberpruefung sued beste daten freuen semester login benutzer gerne erstellen stelle frage system boss moeglichkeit student schoen spam alias geld vertrag juni ansprechpartner telefon raum einrichtung gebaeude telefonbuch abteilung element eintrag nutzer raum pc gerne lehrstuhl voraus fakultaet verfuegung herzliche drucker erreichen tlaptop kabel problem klaerung url adapter feedback koeln grundsaetzlich kaufmann problem fehler verbindung anhang meldung client netz netzwerk wenden funktionieren liebe mitarbeiter unterstuetzung aktuell herr benoetigt raumplanung gb weber vorab ueckmeldung software lizenz programm kurze urlaub gerne installation dankbar informieren team service problem loesung bestellung verlaengern verteiler alte aendern februar oeffnen update pdf browser notwendig fenster schulung beginn wege nord tkurs frage studierende personen teilnehmer standort gerne herunterladen voraus zusenden ews veranstaltung datei iso text umstellung absender message date html arbeit kaiser erfolgreich thema ablauf art at einfuehrung umfrage cloud zugang zugreifen montag probleme kollegin profil server handy web file ticket drucker einrichten senden nr mittwoch card mitteilen nrw kontakt mail fax universitaet it institut hardware hinweis fakultaet not strasse loeschen liste funktion auftrag zeitraum verwaltung angebot vorgehen entfernen moeglichkeit gefunden benutzername informatik gruppe eingabe nachname chemie dame b. angepasst name schoene abt post zukommen verlaengerung sommersemester fehlen namensaenderung auskunft tu dr prof pruefung herr namen fakultaet bereich lehrstuhl installieren buero ok anschluss maerz theologie notebook herr berechtigung master vorbeikommen passwort anmelden account hilfe helfen uniaccount anmeldung kennwort problem boss zugriff referat screenshot support laufwerk bildschirm super tastatur button auswaehlen"
|
||||
bla = bla.split()
|
||||
print(len(bla))
|
||||
print(len(set(bla)))
|
||||
print()
|
||||
|
||||
x = {'a':1, 'b': 2}
|
||||
y = {'b':10, 'c': 11}
|
||||
z = x.update(y)
|
||||
|
||||
print(x)
|
||||
|
||||
"""
|
||||
#save_corpusV2(corpi,corpus_path=corpus_de_path,corpus_name=rawCorpus_name)
|
||||
|
||||
#textacy.fileio.write_file_lines(corpus2Text(corpi), filepath=corpus_de_path+"plain.txt")
|
||||
|
||||
|
||||
dict = {"unicard redaktionsteam": 189, "kms": 131, "itmc_st\u00f6rungen": 17, "benutzerverwaltung_probleme": 168, "mailverteiler exchange": 130, "beamer": 70, "cws_confluence": 190, "benutzerverwaltung": 26, "sos": 166, "virtuelle server": 116, "sap": 7, "wlan": 21, "lsf": 6, "gastaufenthalt": 8, "umzug": 5, "firewall betreuung": 129, "ausleihe": 39, "fiona": 10, "kursplanung": 195, "schulungsraum verwaltung": 200, "plagiatserkennung": 32, "designentwicklung": 100, "ub basis it": 184, "tsm": 51, "backup tsm": 110, "raumkalender": 174, "veeam": 149, "linux bs": 42, "hochleistungsrechnen": 90, "e learning": 37, "h\u00f6rsaal\u00fcbertragung": 52, "sophos": 88, "service portal redaktion": 182, "verkauf": 93, "fk 16": 30, "campus app": 54, "dns": 71, "kurse": 196, "itmc schulungsr\u00e4ume": 96, "leitung": 91, "telefon": 14, "housing": 135, "softwarelizenzen": 35, "hcm stammdaten": 68, "semesterticket": 197, "exchange nutzung": 33, "mediendienste": 167, "sam spider": 172, "pvp": 27, "webserver": 29, "werkvertr\u00e4ge": 158, "ibz raumbuchung": 177, "webmailer": 126, "unicard sperrung": 64, "cd dvd produktion": 114, "lizenzserver": 92, "pr\u00fcfungsmanagement": 38, "blogs wikis foren": 87, "unicard ausgabe": 161, "pools": 157, "desktop & basisdienste": 144, "antrag auf rechnungserstellung": 193, "mailalias": 121, "evaexam": 133, "neuanschluss": 0, "mobilfunkvertr\u00e4ge": 69, "ftp server": 191, "haustechnik": 77, "raumbuchungssysteme": 186, "confluence": 181, "uniaccount zugangsdaten": 47, "itmc medienr\u00e4ume ef50": 171, "dokoll support": 128, "elektronisches telefonbuch": 3, "softwareverteilung": 153, "overhead projektor": 104, "sicherheit": 145, "itmc_als": 48, "itmc pools": 160, "zhb": 60, "serversupport": 101, "veranstaltungen": 61, "fk12 webauftritt": 138, "hardware": 142, "unicard produktion": 156, "telefonkonferenzen": 170, "dhcp": 188, "zertifikate server dfn": 139, "lan": 1, "datanet": 49, "neuausstattung": 173, "moodle": 16, "abmeldung": 13, "uni mail": 15, "medienr\u00e4ume ef50": 117, "verschiedene aufgaben": 40, "zentrale webserver": 75, "vorlesungsaufzeichnung": 152, "grafik": 132, "campus management": 72, "hacker angriff": 46, "pos": 23, "zugangsdaten": 41, "serviceportal": 63, "ews": 24, "voicemail box": 150, "service desk itmc": 74, "test": 180, "beschaffung": 57, "bestellung": 185, "vpn": 55, "app feedback": 66, "allgemein": 134, "rundmail": 105, "telefonabrechnung": 199, "limesurvey": 31, "unicard": 28, "eldorado": 140, "uniaccount": 12, "plotter": 125, "mdm mobile device management": 120, "namens\u00e4nderung": 43, "sd": 84, "basis applikationen": 103, "\u00e4nderung": 194, "fileserver einrichtung": 187, "fk14_test": 154, "werkst\u00e4tte": 179, "itmc_aufgaben": 45, "formulare antr\u00e4ge": 81, "facility": 192, "web": 169, "asknet": 136, "server storage": 113, "mail groupware": 20, "rektorat -b\u00fcro": 178, "office": 50, "werkstoffe lehrstuhl bauwesen": 59, "telefonzentrale": 115, "verwaltung": 4, "netze": 22, "beantragung": 82, "d.3 dms": 148, "redmine projektverwaltung": 141, "wsus": 106, "lido": 118, "rechnerr\u00e4ume": 143, "matrix42_hilfe": 18, "boss service desk": 44, "konteneinsicht": 62, "spam phishing": 53, "forensic": 164, "fk 12": 11, "benutzungsverwaltung": 198, "redmine": 79, "basis app": 85, "viren": 95, "fk12 migration": 155, "raumbuchung": 109, "virtuelle desktops citrix": 176, "outlook_einrichtung": 123, "kundenserver": 137, "nrw ticket": 80, "weiterentwicklung": 127, "siport zugangskontrolle": 98, "e mail dienste": 99, "vorlagenerstellung": 36, 
"video": 19, "studierendensekretariat": 111, "it sicherheit sic": 86, "boss": 25, "technik": 58, "dokoll pvp": 112, "betrieb": 2, "v2 campus app feedback": 151, "mailverteiler": 108, "videoschnitt": 119, "fk raumplanung 09": 9, "sap urlaub": 73, "keine r\u00fcckantwort": 124, "prozess- und projektmanagement": 67, "dienstreise": 34, "webgestaltung": 78, "schulung": 175, "software": 89, "medientechnik": 76, "servicedesk": 107, "service portal": 94, "software entwicklung": 165, "uniflow": 159, "ub_st\u00f6rungen": 162, "fk15": 183, "uhren": 83, "entwicklung": 163, "videokonferenzen": 97, "itmc webauftritt": 102, "joomla itmc website": 147, "changes": 122, "visitenkartenproduktion": 65, "lizenzmanagement": 146, "tonerb\u00f6rse": 201, "arbeitsplatzsupport": 56}
|
||||
|
||||
list = [(key,value) for key,value in dict.items()]
|
||||
|
||||
list.sort(key=lambda tup : tup[1])
|
||||
"""
|
||||
"""
|
||||
from spacy.tokens.doc import Doc as SpacyDoc
|
||||
|
||||
filepath = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/de_clean_ticket_content.bin"
|
||||
|
||||
# load parser
|
||||
parser = spacy.load("de")
|
||||
|
||||
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
||||
|
||||
stringstorepath = corpus_path + 'de_parser/vocab/strings.json'
|
||||
with open(stringstorepath) as file:
|
||||
parser.vocab.strings.load(file)
|
||||
|
||||
vocabpath = Path(corpus_path + 'de_parser/vocab/lexemes.bin')
|
||||
parser.vocab.load_lexemes(vocabpath)
|
||||
|
||||
spacy_vocab = parser.vocab
|
||||
|
||||
def readCorpus(filepath):
|
||||
with open_sesame(filepath, mode='rb') as f:
|
||||
for bytes_string in SpacyDoc.read_bytes(f):
|
||||
yield SpacyDoc(spacy_vocab).from_bytes(bytes_string).text
|
||||
|
||||
|
||||
textacy.fileio.write_file_lines(readCorpus(filepath),"/home/jannis.grundmann/PycharmProjects/topicModelingTickets/result.txt")
|
||||
"""
|
||||
|
||||
|
||||
|
||||
# load raw corpus and create new one
|
||||
#raw_corpus, parser = load_corpusV2(corpus_name=rawCorpus_name, corpus_path=corpus_de_path)
|
||||
|
||||
#printRandomDoc(raw_corpus)
|
||||
|
||||
|
||||
"""
|
||||
spacy_doc = PARSER("test")
|
||||
save_obj(spacy_doc, "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
||||
|
||||
spacy_doc2 = load_obj("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/doc.pkl")
|
||||
|
||||
print("Doc: {0}".format(spacy_doc2))
|
||||
|
||||
|
||||
|
||||
jgibbsLLDA_root = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/"
|
||||
|
||||
LLDA_filepath = "{0}labeldict.txt".format(jgibbsLLDA_root)
|
||||
laveldict = {'fiona': 10, 'vorlagenerstellung': 36, 'webserver': 29, 'matrix42_hilfe': 18, 'sap': 7, 'pos': 23, 'verwaltung': 4, 'lan': 1}
|
||||
with open(LLDA_filepath, 'w') as file:
|
||||
file.write(json.dumps(laveldict))
|
||||
"""
|
||||
"""
|
||||
def load_corpus(corpus_path, corpus_name, lang="de"):
|
||||
from pathlib import Path
|
||||
|
||||
# load parser
|
||||
parser = spacy.load(lang)
|
||||
|
||||
stringstorepath = corpus_path + str(lang) + '_parser'+'/vocab/strings.json'
|
||||
with open(stringstorepath) as file:
|
||||
parser.vocab.strings.load(file)
|
||||
|
||||
vocabpath = Path(corpus_path + str(lang) + '_parser'+'/vocab/lexemes.bin')
|
||||
parser.vocab.load_lexemes(vocabpath)
|
||||
|
||||
corpus = textacy.Corpus(parser)
|
||||
|
||||
|
||||
contentpath = corpus_path + corpus_name + "_content.bin"
|
||||
metapath = corpus_path + corpus_name + "_meta.json"
|
||||
|
||||
|
||||
|
||||
metadata_stream = textacy.fileio.read_json_lines(metapath)
|
||||
spacy_docs = textacy.fileio.read_spacy_docs(corpus.spacy_vocab, contentpath)
|
||||
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
|
||||
corpus.add_doc(
|
||||
textacy.Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
|
||||
return corpus
|
||||
"""
|
||||
|
||||
|
||||
"""
|
||||
# THESAURUS
|
||||
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries_small.xml"
|
||||
lexicalentries = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/lexicalentries.xml"
|
||||
synsets = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/synsets.xml"
|
||||
|
||||
def build_thesaurus(path2lexicalentries):#, path2synsets):
|
||||
lextree = ET.parse(path2lexicalentries, ET.XMLParser(encoding="utf-8"))
|
||||
#syntree = ET.parse(path2synsets, ET.XMLParser(encoding="utf-8"))
|
||||
|
||||
lexroot = lextree.getroot()
|
||||
#synroot = syntree.getroot()
|
||||
|
||||
|
||||
word2synsets = {}
|
||||
template = {"w1": ["s1", "s2"]}
|
||||
|
||||
for ro in lexroot:
|
||||
for elem in ro:
|
||||
if elem.tag == "LexicalEntry":
|
||||
lex_dictlist = [subentry.attrib for subentry in elem]
|
||||
|
||||
|
||||
|
||||
synlist = []
|
||||
string = "WORD"
|
||||
|
||||
for lex_dict in lex_dictlist:
|
||||
if "synset" in lex_dict.keys():
|
||||
|
||||
synset = lex_dict["synset"]
|
||||
synlist.append(synset)
|
||||
|
||||
if 'writtenForm' in lex_dict.keys():
|
||||
string = (lex_dict["writtenForm"])
|
||||
|
||||
# replaceRockDots
|
||||
string = re.sub(r'[ß]', "ss", string)
|
||||
string = re.sub(r'[ö]', "oe", string)
|
||||
string = re.sub(r'[ü]', "ue", string)
|
||||
string = re.sub(r'[ä]', "ae", string)
|
||||
|
||||
# remove all dots
|
||||
string = re.sub(r'[.]', "", string)
|
||||
|
||||
# remove everything in parentheses
|
||||
string = re.sub(r"\((.*)\)", " ", string)
|
||||
|
||||
# normalize longer whitespace runs
|
||||
string = textacy.preprocess.normalize_whitespace(string)
|
||||
|
||||
string = string.lower().strip()
|
||||
|
||||
word2synsets[string] = synlist
|
||||
|
||||
synset2Words = {}
|
||||
template = {"s1": ["w1","w2"]}
|
||||
|
||||
for word,synset in word2synsets.items():
|
||||
for syn in synset:
|
||||
if syn not in synset2Words.keys():
|
||||
synset2Words[syn] = [word]
|
||||
else:
|
||||
synset2Words[syn].append(word)
|
||||
|
||||
# sort by the number of words in the strings
|
||||
for synset in word2synsets.values():
|
||||
synset.sort(key=lambda x: len(x.split()))
|
||||
|
||||
thesaurus = {}
|
||||
thesaurus_template = {"w1" : "mainsyn"}
|
||||
|
||||
for word,synset in word2synsets.items():
|
||||
try:
|
||||
thesaurus[word] = synset2Words[synset[0]][0]  # assumption: the first synonym is the main synonym (Hauptsynonym)
|
||||
except:
|
||||
pass
|
||||
return thesaurus
|
||||
|
||||
|
||||
for r in synroot:
|
||||
for element in r:
|
||||
|
||||
if element.tag == "Synset":
|
||||
synset = []
|
||||
attrib = element.attrib
|
||||
id = attrib["id"]
|
||||
|
||||
if id not in synset2Words.keys():
|
||||
synset2Words[id] = "WORD"
|
||||
|
||||
"""
|
||||
|
||||
"""
|
||||
from postal.parser import parse_address
|
||||
|
||||
|
||||
address = "Nicolas Rauner LS Biomaterialien und Polymerwissenschaften Fakultät Bio- und Chemieingenieurwesen TU Dortmund D-44227 Dortmund Tel: + 49-(0)231 / 755 - 3015 Fax: + 49-(0)231 / 755 - 2480"
|
||||
print(parse_address(address))
|
||||
|
||||
|
||||
address = "Technische Universität Dortmund Maschinenbau/Lehrstuhl für Förder- und Lagerwesen LogistikCampus Joseph-von-Fraunhofer-Str. 2-4 D-44227 Dortmund "
|
||||
print(parse_address(address))
|
||||
"""
|
||||
|
||||
"""
|
||||
|
||||
corpus_path = "/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/"
|
||||
corpus_name = "testcorpus"
|
||||
|
||||
|
||||
#corpi.save(corpus_path, name=corpus_name, compression=corpus_compression)
|
||||
#corpi = textacy.Corpus.load(corpus_path, name=corpus_name, compression=corpus_compression)
|
||||
|
||||
|
||||
|
||||
import pathlib
|
||||
|
||||
strings_path = pathlib.Path(corpus_path + 'strings.json')
|
||||
path_lexemes_bin_ = pathlib.Path(corpus_path + 'lexemes.bin')
|
||||
|
||||
PARSER.vocab.dump(path_lexemes_bin_)
|
||||
nlp.vocab.load_lexemes(path_lexemes_bin_)
|
||||
|
||||
|
||||
def save_corpus(corpus_path,corpus_name):
|
||||
|
||||
# save stringstore
|
||||
stringstore_path = corpus_path + corpus_name + '_strings.json'
|
||||
with open(stringstore_path, "w") as file:
|
||||
PARSER.vocab.strings.dump(file)
|
||||
|
||||
|
||||
#save content
|
||||
contentpath = corpus_path + corpus_name+ "_content.bin"
|
||||
textacy.fileio.write_spacy_docs((doc.spacy_doc for doc in corpi),contentpath)
|
||||
|
||||
|
||||
#save meta
|
||||
metapath = corpus_path + corpus_name +"_meta.json"
|
||||
textacy.fileio.write_json_lines((doc.metadata for doc in corpi), metapath)
|
||||
|
||||
|
||||
|
||||
def load_corpus(corpus_path,corpus_name):
|
||||
# load new lang
|
||||
nlp = spacy.load("de")
|
||||
|
||||
#load stringstore
|
||||
stringstore_path = corpus_path + corpus_name + '_strings.json'
|
||||
with open(stringstore_path,"r") as file:
|
||||
nlp.vocab.strings.load(file)
|
||||
|
||||
# define corpi
|
||||
corpi = textacy.Corpus(nlp)
|
||||
|
||||
# load meta
|
||||
metapath = corpus_path + corpus_name +"_meta.json"
|
||||
metadata_stream = textacy.fileio.read_json_lines(metapath)
|
||||
|
||||
#load content
|
||||
contentpath = corpus_path + corpus_name+ "_content.bin"
|
||||
spacy_docs = textacy.fileio.read_spacy_docs(corpi.spacy_vocab, contentpath)
|
||||
|
||||
for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
|
||||
corpi.add_doc(
|
||||
textacy.Doc(spacy_doc, lang=corpi.spacy_lang, metadata=metadata))
|
||||
|
||||
return corpi
|
||||
|
||||
|
||||
save_corpus(corpus_path,corpus_name)
|
||||
|
||||
print(load_corpus(corpus_path,corpus_name))
|
||||
|
||||
"""
|
||||
|
||||
"""
|
||||
def normalizeSynonyms(default_return_first_Syn=False, parser=PARSER):
|
||||
#return lambda doc : parser(" ".join([tok.lower_ for tok in doc]))
|
||||
return lambda doc : parser(" ".join([getFirstSynonym(tok.lower_, THESAURUS, default_return_first_Syn=default_return_first_Syn) for tok in doc]))
|
||||
|
||||
def getFirstSynonym(word, thesaurus, default_return_first_Syn=False):
|
||||
if not isinstance(word, str):
|
||||
return str(word)
|
||||
|
||||
word = word.lower()
|
||||
|
||||
# iterate over the thesaurus
|
||||
for syn_block in thesaurus: # syn_block is a list of synonyms
|
||||
|
||||
for syn in syn_block:
|
||||
syn = syn.lower()
|
||||
if re.match(r'\A[\w-]+\Z', syn): # if syn is a single word
|
||||
if word == syn:
|
||||
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
|
||||
else: # otherwise syn is a phrase
|
||||
if word in syn:
|
||||
return str(getHauptform(syn_block, word, default_return_first_Syn=default_return_first_Syn))
|
||||
return str(word) # as a fallback, return the original word
|
||||
|
||||
def getHauptform(syn_block, word, default_return_first_Syn=False):
|
||||
for syn in syn_block:
|
||||
syn = syn.lower()
|
||||
|
||||
if "hauptform" in syn and len(syn.split(" ")) <= 2:
|
||||
# do not return it if it is in parentheses  #todo does this actually happen sometimes?? strip the parentheses
|
||||
for w in syn.split(" "):
|
||||
if not re.match(r'\([^)]+\)', w):
|
||||
return w
|
||||
|
||||
if default_return_first_Syn:
|
||||
# if no Hauptform (main form) is contained, return the first synonym that is not a phrase and not in parentheses
|
||||
for w in syn_block:
|
||||
if not re.match(r'\([^)]+\)', w):
|
||||
return w
|
||||
return word # as a fallback, return the original word
|
||||
"""
|
||||
|
||||
"""
|
||||
path2xml="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deWordNet.xml"
|
||||
|
||||
tree = ET.parse(path2xml, ET.XMLParser(encoding="utf-8"))
|
||||
root = tree.getroot()
|
||||
|
||||
for r in root:
|
||||
for element in r:
|
||||
|
||||
if element.tag == "Synset":
|
||||
attrib = element.attrib
|
||||
for i,subentry in enumerate(element):
|
||||
if subentry.tag == "Lemma" and subentry.attrib["partOfSpeech"] == "n":
|
||||
string = (subentry.attrib["writtenForm"])
|
||||
# replaceRockDots
|
||||
string = re.sub(r'[ß]', "ss", string)
|
||||
string = re.sub(r'[ö]', "oe", string)
|
||||
string = re.sub(r'[ü]', "ue", string)
|
||||
string = re.sub(r'[ä]', "ae", string)
|
||||
|
||||
# seperate_words_on_regex:
|
||||
string = " ".join(re.compile(regex_specialChars).split(string))
|
||||
string_list=string.split()
|
||||
if len(string_list) == 1:
|
||||
nomen.append(string.lower().strip())
|
||||
"""
|
||||
|
||||
"""
|
||||
import re
|
||||
from collections import Counter
|
||||
|
||||
def words(text): return re.findall(r'\w+', text.lower())
|
||||
|
||||
WORDS = Counter(words(open('/home/jannis.grundmann/PycharmProjects/topicModelingTickets/deu_news_2015_1M-sentences.txt').read()))
|
||||
|
||||
def P(word, N=sum(WORDS.values())):
|
||||
"Probability of `word`."
|
||||
return WORDS[word] / N
|
||||
|
||||
def correction(word):
|
||||
"Most probable spelling correction for word."
|
||||
return max(candidates(word), key=P)
|
||||
|
||||
def candidates(word):
|
||||
"Generate possible spelling corrections for word."
|
||||
return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])
|
||||
|
||||
def known(words):
|
||||
"The subset of `words` that appear in the dictionary of WORDS."
|
||||
return set(w for w in words if w in WORDS)
|
||||
|
||||
def edits1(word):
|
||||
"All edits that are one edit away from `word`."
|
||||
letters = 'abcdefghijklmnopqrstuvwxyz'
|
||||
splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
|
||||
deletes = [L + R[1:] for L, R in splits if R]
|
||||
transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
|
||||
replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
|
||||
inserts = [L + c + R for L, R in splits for c in letters]
|
||||
return set(deletes + transposes + replaces + inserts)
|
||||
|
||||
def edits2(word):
|
||||
"All edits that are two edits away from `word`."
|
||||
return (e2 for e1 in edits1(word) for e2 in edits1(e1))
|
||||
|
||||
"""
|
||||
|
||||
"""
|
||||
### extract from derewo
|
||||
|
||||
#http://www1.ids-mannheim.de/kl/projekte/methoden/derewo.html
|
||||
|
||||
|
||||
raw = textacy.fileio.read_file_lines("DeReKo-2014-II-MainArchive-STT.100000.freq")
|
||||
|
||||
for line in raw:
|
||||
line_list=line.split()
|
||||
if line_list[2] == "NN":
|
||||
string = line_list[1].lower()
|
||||
|
||||
# replaceRockDots
|
||||
string = re.sub(r'[ß]', "ss", string)
|
||||
string = re.sub(r'[ö]', "oe", string)
|
||||
string = re.sub(r'[ü]', "ue", string)
|
||||
string = re.sub(r'[ä]', "ae", string)
|
||||
|
||||
|
||||
nomen.append(string.lower().strip())
|
||||
|
||||
|
||||
textacy.fileio.write_file_lines(nomen,"nomen2.txt")
|
||||
"""
|
||||
|
||||
"""
|
||||
stream = textacy.fileio.read_csv("/home/jannis.grundmann/PycharmProjects/topicModelingTickets/M42-Export/Tickets_2017-09-13.csv", delimiter=";")
|
||||
content_collumn_name = "Description"
|
||||
content_collumn = 9 # standardvalue
|
||||
|
||||
de_tickets=[]
|
||||
en_tickets=[]
|
||||
misc_tickets=[]
|
||||
|
||||
error_count = 0
|
||||
for i, lst in enumerate(stream):
|
||||
if i == 0:
|
||||
de_tickets.append(lst)
|
||||
en_tickets.append(lst)
|
||||
misc_tickets.append(lst)
|
||||
else:
|
||||
try:
|
||||
content_collumn_ = lst[content_collumn]
|
||||
if detect(content_collumn_) == "de":
|
||||
de_tickets.append(lst)
|
||||
elif detect(content_collumn_) == "en":
|
||||
en_tickets.append(lst)
|
||||
else:
|
||||
misc_tickets.append(lst)
|
||||
|
||||
except:
|
||||
misc_tickets.append(lst)
|
||||
error_count += 1
|
||||
|
||||
print(error_count)
|
||||
|
||||
textacy.fileio.write_csv(de_tickets,"M42-Export/de_tickets.csv", delimiter=";")
|
||||
textacy.fileio.write_csv(en_tickets,"M42-Export/en_tickets.csv", delimiter=";")
|
||||
textacy.fileio.write_csv(misc_tickets,"M42-Export/misc_tickets.csv", delimiter=";")
|
||||
|
||||
|
||||
"""

"""
regex_specialChars = r'[`\-=~!#@,.$%^&*()_+\[\]{};\'\\:"|</>?]'


def stringcleaning(stringstream, funclist):
    # apply every cleaning function in funclist to every string in the stream
    for string in stringstream:
        for f in funclist:
            string = f(string)
        yield string


def seperate_words_on_regex(regex=regex_specialChars):
    # split on special characters and re-join with single spaces
    return lambda string: " ".join(re.compile(regex).split(string))


words = [
    "uniaccount",
    "nr54065467",
    "nr54065467",
    "455a33c5,",
    "tvt?=",
    "tanja.saborowski@tu-dortmund.de",
    "-",
    "m-sw1-vl4053.itmc.tu-dortmund.de",
    "------problem--------",
]

topLVLFinder = re.compile(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', re.IGNORECASE)
specialFinder = re.compile(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', re.IGNORECASE)

for s in stringcleaning((w for w in words), [seperate_words_on_regex()]):
    print(s.strip())

# print(stringcleaning(w, string_comp))
# print(bool(re.search(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', w)))
# print(bool(re.search(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', w)))
# result = specialFinder.sub(" ", w)
# print(re.sub(r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./>?]', " ", w))
# print(re.sub(r'\.[a-z]{2,3}(\.[a-z]{2,3})?', " ", w))
"""

"""
def replaceRockDots():
    # lower-case and transliterate umlauts/sharp s (ä->ae, ö->oe, ü->ue, ß->ss)
    return lambda string: re.sub(r'[ß]', "ss",
                          re.sub(r'[ö]', "oe",
                          re.sub(r'[ü]', "ue",
                          re.sub(r'[ä]', "ae", string.lower()))))


de_stop_words = list(textacy.fileio.read_file_lines(filepath="german_stopwords_full.txt"))

# blob = Text(str(textacy.fileio.read_file("teststring.txt")))  # ,parser=PatternParser(pprint=True, lemmata=True))
# print(blob.entities)

de_stop_words = list(map(replaceRockDots(), de_stop_words))
# LEMMAS = list(map(replaceRockDots(), LEMMAS))
# VORNAMEN = list(map(replaceRockDots(), VORNAMEN))

de_stop_words = list(map(textacy.preprocess.normalize_whitespace, de_stop_words))
# LEMMAS = list(map(textacy.preprocess.normalize_whitespace, LEMMAS))
# VORNAMEN = list(map(textacy.preprocess.normalize_whitespace, VORNAMEN))

# textacy.fileio.write_file_lines(LEMMAS, "lemmas.txt")
# textacy.fileio.write_file_lines(VORNAMEN, "firstnames.txt")
textacy.fileio.write_file_lines(de_stop_words, "german_stopwords.txt")
"""

end = time.time()
print("\n\n\nTime Elapsed Topic:{0}\n\n".format(end - start))

@@ -1,227 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<verzeichnis>
|
||||
<ticket>
|
||||
<Zusammenfassung>Telephone Contract</Zusammenfassung>
|
||||
<Kategorie>Neuanschluss</Kategorie>
|
||||
<Beschreibung>
|
||||
Telefon-Neuanschluss
|
||||
|
||||
Antragsteller:
|
||||
|
||||
Melanie Hinrichs
|
||||
|
||||
melanie.hinrichs@tu-dortmund.de
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Terminvorschlag unbestimmt
|
||||
|
||||
Einrichtung Dezernat 3
|
||||
|
||||
Abteilung Abteilung 2
|
||||
|
||||
PSP Element L-11-10000-100-302300
|
||||
|
||||
UniAccount myvowest(Westerdorf, Yvonne)
|
||||
|
||||
Gebäude Pavillon 8
|
||||
|
||||
Raum ID 031 (63292)
|
||||
|
||||
Telefondose keine vorhanden
|
||||
|
||||
Telefonnr. -
|
||||
|
||||
Eintrag Telefonbuch
|
||||
|
||||
E-Mail melanie.hinrichs@tu-dortmund.de
|
||||
|
||||
Voicemail Nicht erwünscht
|
||||
|
||||
Ansprechpartner Melanie Hinrichs
|
||||
|
||||
Tel. Ansprechpartner 5848
|
||||
|
||||
Verantwortlicher Nutzer -
|
||||
|
||||
Type Amt
|
||||
|
||||
Bemerkung:
|
||||
|
||||
Es wird ein Telefon benötigt,ein Telefon mit 6 Speicherpl.f.die Gruppenfunktion ist ausreichend. Die Möbel werden am 10.06.2015 aufgestellt.Weder Netzwerkdose noch Telefondose vorhanden. Dez.6 hat Vorbereitungen getroffen.
|
||||
</Beschreibung>
|
||||
<Loesung>Frau Hinrichs überdenkt die Situation und macht dann neue Anträge.
|
||||
Dieses Ticket wird geschlossen</Loesung>
|
||||
</ticket>
|
||||
<ticket>
|
||||
<Zusammenfassung>LSF/BOSS Datenexport</Zusammenfassung>
|
||||
<Kategorie>LSF</Kategorie>
|
||||
<Beschreibung>Sehr geehrter ITMC Service,
|
||||
|
||||
ich muss mir jedes Semester meine Veranstaltungen für das kommende
|
||||
|
||||
Semester zusammen suchen und stelle mir die Frage "Wie weit bin ich mit
|
||||
|
||||
meinem Studium? Welche Module kann ich wann belegen?". Gerade bei
|
||||
|
||||
mehreren Wahlmodulen gestaltet sich dies ja doch etwas schwieriger.
|
||||
|
||||
Daher möchte ich gerne, zunächst als experimentelle Privatprojekt, eine
|
||||
|
||||
leichtgewichtige Webseite erstellen, die mir dabei helfen soll. Meine
|
||||
|
||||
Vision ist, dies in weiteren Stufen meinen Kommilitonen der
|
||||
|
||||
Informatik-Fakultät und danach allen Studierenden zu Verfügung zu stellen.
|
||||
|
||||
Statt "das Rad neu zu erfinden" möchte ich einfach eine andere
|
||||
|
||||
Sichtweise auf die Daten in LSF und BOSS schaffen.
|
||||
|
||||
Zentraler Aspekt ist hier der Studienplan des Studiengangs, der
|
||||
|
||||
gleichzeitig eine Übersicht über noch zu erledigende Module und die
|
||||
|
||||
aktuelle Gesamt-Creditzahl liefern soll. Diese Ansicht soll auch dazu
|
||||
|
||||
dienen festzulegen, in welchem Semester man welches Modul machen möchte.
|
||||
|
||||
Darauf aufbauend möchte ich in einer nächsten Stufe gerne detaillierte
|
||||
|
||||
Veranstaltungsinformationen, ähnlich dem LSF, in einer
|
||||
|
||||
Facettensuche(ähnlich anzeigen. Dadurch sollen diese nach Studiengang,
|
||||
|
||||
Fakultät, Dozent, Semester, Turnus, etc. durchsuchbar werden.
|
||||
|
||||
Um den Studenten eine Orientierung zu liefern, wäre es zudem in einer
|
||||
|
||||
zukünftigen Version vorstellbar, den Studenten anhand der Studienpläne
|
||||
|
||||
und Modulabhängigkeiten (vorausgesetzte/erwünschte Kentnisse)
|
||||
|
||||
Veranstaltungen für das kommende Semester vorzuschlagen und
|
||||
|
||||
automatisiert Stundenpläne zu erstellen.
|
||||
|
||||
Daher möchte ich erfragen, ob
|
||||
|
||||
- es möglich ist einen Datenbank-Dump der Veranstaltungs-Basisdaten(z.B.
|
||||
|
||||
Titel, Dozent, Turnus, Uhrzeit, Beschreibung, etc.) zu erhalten
|
||||
|
||||
- das LSF und/oder das BOSS eine Programmierschnittstelle zur
|
||||
|
||||
Datenabfrage haben, welche nicht auf einem Login mit
|
||||
|
||||
Benutzername/Passwort basiert
|
||||
|
||||
- es möglich ist für einzelne Benutzer mit deren Erlaubnis eine Liste
|
||||
|
||||
aller Studienleistungen inkl. Veranstaltungs/BOSS-Nummer in einem
|
||||
|
||||
maschinenlesbaren Format (z.B. CSV oder XML, nicht PDF) abzufragen
|
||||
|
||||
Falls Sie noch offene Fragen haben, lassen Sie es mich wissen. Gerne
|
||||
|
||||
können wir diese auch in einem persönlichen Gespräch klären.
|
||||
|
||||
Vielen Dank!
|
||||
|
||||
|
||||
|
||||
Mit freundlichen Grüßen,
|
||||
|
||||
Tobias Brennecke
|
||||
</Beschreibung>
|
||||
<Loesung>alt</Loesung>
|
||||
</ticket>
|
||||
<ticket>
|
||||
<Zusammenfassung>Zurücksetzung Passwort BOSS</Zusammenfassung>
|
||||
<Kategorie>ITMC_Störungen</Kategorie>
|
||||
<Beschreibung>Hallo.
|
||||
|
||||
Bitte setzen Sie mein Passwort zurück.
|
||||
Ich würde gerne eine neues wählen.
|
||||
|
||||
Mit freundlichen Grüßen,
|
||||
|
||||
Ahmann.
|
||||
|
||||
IMAP0013 OK Completed (0.000 sec
|
||||
IMAP0013 OK Completed (0.000 sec
|
||||
IMAP0013 OK Completed (0.000 sec</Beschreibung>
|
||||
<Loesung>können Sie sich im Service Portal einloggen?
|
||||
|
||||
Wenn ja, dann löschen Sie Ihre Cookies und den Cache.
|
||||
Anschließend sollte auch die BOSS Anmeldung klappen.
|
||||
|
||||
Verwenden Sie Firefox oder Chrome.
|
||||
|
||||
Achten Sie darauf, dass der Account klein geschrieben ist, wenn sie sich mit einem Mobilgerät einloggen.
|
||||
|
||||
Sollte die Anmeldung im Service Portal nicht funktionieren, dann können Sie persönlich im Service Desk vorbeikommen und gegen Vorlage Ihres Personalausweises/Unicard Ihre Anmelde-Daten erhalten. Auch können wir Ihnen Ihre Zugangsdaten per Post zuschicken. Dazu müssen Sie allerdings ein paar Sicherheitsfragen beantworten:
|
||||
|
||||
1. Wie lautet Ihr Unimail-Namenskürzel (beginnend mit 'm' oder 'sm')
|
||||
|
||||
2. Wie lautet Ihre Matrikel-Nummer?
|
||||
|
||||
3. Wie lautet Ihr Geburtsdatum?
|
||||
|
||||
4. Wie lautet Ihre hinterlegte Post-Adresse?
|
||||
|
||||
5. Wie lautet die Antwort auf Ihre Sicherheitsfrage Geburtsname der Mutter?
|
||||
|
||||
6. Wie lautet Ihre aktuelle Post-Adresse?
|
||||
</Loesung>
|
||||
</ticket>
|
||||
<ticket>
|
||||
<Zusammenfassung>Forschungsantrag - Geräteanfrage</Zusammenfassung>
|
||||
<Kategorie>Video</Kategorie>
|
||||
<Beschreibung>Sehr geehrtes ITMC-Team,
|
||||
|
||||
für einen Forschungsantrag benötige ich einige technische Informationen
|
||||
und bitte Sie herzlich um Unterstützung:
|
||||
|
||||
Zur Unterrichtsaufzeichnung möchte ich gern mit GoPro-Kameras arbeiten.
|
||||
Ich möchte 4 Kameras beantragen. Könnten Sie mich beraten, welche
|
||||
zusätzlichen Geräte man benötigt, um die Informationen dann für Lehre und
|
||||
Forschung zu verarbeiten? Ich bin nicht sicher: gibt es Geräte, die eine
|
||||
parallele Betrachtung ermöglichen? Benötigt man zusätzliche
|
||||
Speicherkapazitäten? Sehr dankbar wäre ich, wenn Sie die Infos gleich mit
|
||||
Kostenkalkulationen für den Antrag verbinden könnten.
|
||||
|
||||
Eine weitere Frage gleich nebenbei: Wird es an der TU auch die Möglichkeit
|
||||
geben, in den Hörsälen direkt zentral Podcasts der Vorlesungen
|
||||
aufzuzeichnen? Die Kollegen an der RUB verfügen über diese Möglichkeit
|
||||
jenseits individueller Camtasia-Aufzeichnungen. Dort wird das zentral und
|
||||
standardmäßig gemacht.
|
||||
|
||||
Ich arbeite momentan vom heimischen Schreibtisch aus. Sollten sie
|
||||
Rückfragen telefonisch machen wollen, erreichten Sie mich unter
|
||||
02302-9147798.
|
||||
|
||||
Ganz herzlichen Dank für Ihre Unterstützung!
|
||||
|
||||
Mit herzlichen Grüßen
|
||||
|
||||
Gudrun Marci-Boehncke
|
||||
|
||||
|
||||
Prof. Dr. Gudrun Marci-Boehncke
|
||||
TU Dortmund
|
||||
Institut für deutsche Sprache und Literatur
|
||||
Emil-Figge Str. 50, 3.241
|
||||
44227 Dortmund
|
||||
|
||||
IMAP0013 OK Completed (0.000 sec
|
||||
IMAP0013 OK Completed (0.000 sec
|
||||
IMAP0013 OK Completed (0.000 sec
|
||||
</Beschreibung>
|
||||
<Loesung>Problem wurde telefonisch besprochen und eine Beratung ist dort erfolgt. Weitere Kommunikation erfolgt via eMail.</Loesung>
|
||||
</ticket>
|
||||
</verzeichnis>
|
410
topicModeling.py
|
@ -3,6 +3,7 @@
|
|||
from datetime import datetime
|
||||
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
import csv
|
||||
import sys
|
||||
|
@ -20,8 +21,6 @@ import os
|
|||
csv.field_size_limit(sys.maxsize)
|
||||
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"
|
||||
|
||||
# ssh madonna "nohup /usr/bin/python3 -u /home/jannis.grundmann/PycharmProjects/topicModelingTickets/topicModeling.py &> /home/jannis.grundmann/PycharmProjects/topicModelingTickets/printout_topicModeling.log &"
|
||||
|
||||
|
||||
# load config
|
||||
config_ini = FILEPATH + "config.ini"
|
||||
|
@ -35,12 +34,12 @@ def label2ID(label, labeldict):
|
|||
return labeldict.get(label, len(labeldict))
|
||||
|
||||
|
||||
def generate_labled_lines(textacyCorpus, labeldict):
|
||||
def generate_lablelID_lines(textacyCorpus, labeldict):
|
||||
for doc in textacyCorpus:
|
||||
# generate [topic1, topic2....] tok1 tok2 tok3 out of corpi
|
||||
yield "[" + str(label2ID(doc.metadata["categoryName"], labeldict)) + "] " + doc.text
|
||||
|
||||
|
||||
"""
|
||||
def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf', named_entities=True):
|
||||
logprint(str("ngrams: {0}".format(ngrams)))
|
||||
logprint(str("min_df: {0}".format(min_df)))
|
||||
|
@ -58,37 +57,47 @@ def printvecotorization(de_corpus, ngrams=1, min_df=1, max_df=1.0, weighting='tf
|
|||
print(t)
|
||||
logprint("doc_term_matrix: {0}".format(doc_term_matrix))
|
||||
logprint("id2term: {0}".format(id2term))
|
||||
"""
|
||||
|
||||
def textacyTopicModeling(corpus,
|
||||
n_topics = 15, top_topic_words = 7, top_document_labels_per_topic = 5,
|
||||
ngrams = 1, min_df=1, max_df=1.0,
|
||||
topicModel='lda'):
|
||||
|
||||
|
||||
|
||||
|
||||
n_terms = int(n_topics * top_topic_words)
|
||||
sort_terms_by = 'seriation' # 'seriation', 'weight', 'index', 'alphabetical'
|
||||
rank_terms_by = 'corpus' # 'corpus', 'topic'
|
||||
|
||||
|
||||
|
||||
|
||||
def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='lda', named_entities=False):
|
||||
logprint(
|
||||
"############################################ Topic Modeling {0} #############################################".format(
|
||||
"############### Topic Modeling {0} ###########################".format(
|
||||
topicModel))
|
||||
print("\n\n")
|
||||
logprint(str("ngrams: {0}".format(ngrams)))
|
||||
logprint(str("min_df: {0}".format(min_df)))
|
||||
logprint(str("max_df: {0}".format(max_df)))
|
||||
logprint(str("n_topics: {0}".format(n_topics)))
|
||||
logprint(str("named_entities: {0}".format(named_entities)))
|
||||
logprint("\n")
|
||||
|
||||
start = time.time()
|
||||
|
||||
top_topic_words = 7
|
||||
top_document_labels_per_topic = 5
|
||||
|
||||
# http://textacy.readthedocs.io/en/latest/api_reference.html#textacy.tm.topic_model.TopicModel.get_doc_topic_matrix
|
||||
weighting = ('tf' if topicModel == 'lda' else 'tfidf')
|
||||
|
||||
|
||||
|
||||
|
||||
####################'####################
|
||||
|
||||
|
||||
# printlog("vectorize corpi...")
|
||||
#################### vectorize corpi ####################
|
||||
|
||||
vectorizer = Vectorizer(weighting=weighting, min_df=min_df, max_df=max_df)
|
||||
|
||||
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=named_entities, as_strings=True) for doc in corpus)
|
||||
terms_list = (doc.to_terms_list(ngrams=ngrams, named_entities=False, as_strings=True) for doc in corpus)
|
||||
doc_term_matrix = vectorizer.fit_transform(terms_list)
|
||||
id2term = vectorizer.__getattribute__("id_to_term")
|
||||
|
||||
|
@ -97,44 +106,40 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='l
|
|||
|
||||
|
||||
|
||||
##################### LSA, LDA, NMF Topic Modeling via Textacy ##############################################
|
||||
|
||||
# Initialize and train a topic model
|
||||
# printlog("Initialize and train a topic model..")
|
||||
|
||||
##################### Initialize and train a topic model ##############################################
|
||||
|
||||
model = textacy.tm.TopicModel(topicModel, n_topics=n_topics)
|
||||
|
||||
model.fit(doc_term_matrix)
|
||||
|
||||
# Transform the corpi and interpret our model:
|
||||
# printlog("Transform the corpi and interpret our model..")
|
||||
doc_topic_matrix = model.transform(doc_term_matrix)
|
||||
print()
|
||||
|
||||
|
||||
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_topic_words):
|
||||
logprint('topic {0}: {1}'.format(topic_idx, " ".join(top_terms)))
|
||||
|
||||
print()
|
||||
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=top_document_labels_per_topic):
|
||||
logprint(topic_idx)
|
||||
for j in top_docs:
|
||||
logprint(corpus[j].metadata['categoryName'])
|
||||
print()
|
||||
|
||||
#####################################################################################################################
|
||||
print()
|
||||
print()
|
||||
|
||||
|
||||
# termite plot
|
||||
n_terms = int(n_topics*top_topic_words)
|
||||
sort_terms_by = 'seriation' #'seriation', 'weight', 'index', 'alphabetical'
|
||||
rank_terms_by = 'corpus' # 'corpus', 'topic'
|
||||
|
||||
|
||||
####################### termite plot ###################################################################
|
||||
|
||||
grams_label = "uni" if ngrams == 1 else "bi"
|
||||
|
||||
model.termite_plot(doc_term_matrix, id2term,
|
||||
|
||||
n_terms=n_terms,
|
||||
sort_terms_by=sort_terms_by,
|
||||
rank_terms_by=rank_terms_by+'_weight',
|
||||
|
||||
save="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/results/{}_{}_{}_{}_{}.png".format(topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
|
||||
|
||||
save= FILEPATH + "results/{}_{}_{}_{}_{}_{}.png".format(grams_label,topicModel,n_topics,n_terms,sort_terms_by,rank_terms_by))
|
||||
|
||||
|
||||
|
||||
|
@ -142,48 +147,51 @@ def textacyTopicModeling(ngrams, min_df, max_df, corpus, n_topics, topicModel='l
|
|||
logprint("\n\n\nTime Elapsed Topic Modeling with {1}:{0} min\n\n".format((end - start) / 60, topicModel))
|
||||
|
||||
|
||||
def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=False):
|
||||
##################### LLDA Topic Modeling via JGibbsLabledLDA ##############################################
|
||||
|
||||
def jgibbsLLDA(corpus, path2save_results, top_topic_words=7):
|
||||
start = time.time()
|
||||
|
||||
# build dictionary of ticketcategories
|
||||
labelist = []
|
||||
|
||||
|
||||
jgibbsLLDA_root = FILEPATH + "java_LabledLDA/"
|
||||
|
||||
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
|
||||
|
||||
|
||||
|
||||
|
||||
# build dictionary of ticketcategories
|
||||
labelist = []
|
||||
for texdoc in corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
|
||||
labelist.append(texdoc.metadata["categoryName"])
|
||||
|
||||
|
||||
labeldict = {k: v for v, k in enumerate(labelist)}
|
||||
reverse_labeldict = {v: k for k, v in labeldict.items()}
|
||||
|
||||
if add_default_topic:
|
||||
n_topics = len(labeldict) + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
|
||||
else:
|
||||
n_topics = len(labeldict) # + 1 # len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
|
||||
|
||||
jgibbsLLDA_root = FILEPATH + "/java_LabledLDA/"
|
||||
|
||||
LLDA_filepath = "{0}models/tickets/tickets.gz".format(jgibbsLLDA_root)
|
||||
#dict_path = "{0}models/tickets/labeldict.txt".format(jgibbsLLDA_root)
|
||||
dict_path = FILEPATH +"results/labeldict.txt".format(jgibbsLLDA_root)
|
||||
|
||||
# printlog(str("LABELDICT: {0}".format(labeldict)))
|
||||
#logprint(str("LABELDICT-length: {0}".format(len(labeldict))))
|
||||
with open(dict_path, 'w') as file:
|
||||
#and save
|
||||
labeldict_path = FILEPATH + "results/labeldict.txt"
|
||||
with open(labeldict_path, 'w') as file:
|
||||
file.write(json.dumps(labeldict))
|
||||
|
||||
# for line in generate_labled_lines(de_corpus,labeldict):
|
||||
# print(line)
|
||||
|
||||
# create file
|
||||
textacy.fileio.write_file_lines(generate_labled_lines(corpus, labeldict), filepath=LLDA_filepath)
|
||||
n_topics = len(labeldict) #+1 #default-topic
|
||||
|
||||
|
||||
|
||||
# create file with label_IDs (input for llda)
|
||||
textacy.fileio.write_file_lines(generate_lablelID_lines(corpus, labeldict), filepath=LLDA_filepath)
|
||||
|
||||
# wait for file to exist
|
||||
while not os.path.exists(LLDA_filepath):
|
||||
time.sleep(1)
|
||||
#top_topic_words=1
|
||||
|
||||
logprint("")
|
||||
logprint("start LLDA:")
|
||||
# run JGibsslda file
|
||||
|
||||
|
||||
# run JGibbsLLDA file
|
||||
|
||||
FNULL = open(os.devnull, 'w') # supress output
|
||||
cmd_jgibbs_java = ["java", "-cp",
|
||||
"{0}lib/trove-3.0.3.jar:{0}lib/args4j-2.0.6.jar:{0}out/production/LabledLDA/".format(
|
||||
|
@ -193,44 +201,20 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
|
|||
"-twords", str(top_topic_words), "-ntopics", str(n_topics)]
|
||||
subprocess.call(cmd_jgibbs_java, stdout=FNULL)
|
||||
|
||||
|
||||
# ANMERKUNG: Dateien sind versteckt. zu finden in models/
|
||||
|
||||
# twords
|
||||
"""
|
||||
subprocess.call(["gzip",
|
||||
"-dc",
|
||||
"{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)])
|
||||
"""
|
||||
|
||||
cmd_gzip = ["gzip", "-dc", "{0}/models/tickets/.twords.gz".format(jgibbsLLDA_root)]
|
||||
"""
|
||||
proc = subprocess.Popen(cmd_gzip, stdout=subprocess.PIPE)
|
||||
|
||||
process = subprocess.Popen(cmd_gzip, shell=True,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
|
||||
# wait for the process to terminate
|
||||
out, err = process.communicate()
|
||||
errcode = process.returncode
|
||||
|
||||
result = subprocess.check_output(cmd_gzip)
|
||||
|
||||
#result = proc.stdout.read()
|
||||
result = proc.communicate()
|
||||
out=[]
|
||||
for line in result:
|
||||
out.append(line)
|
||||
"""
|
||||
|
||||
output = subprocess.check_output(cmd_gzip).decode("utf-8")
|
||||
|
||||
reverse_labeldict = {v: k for k, v in labeldict.items()}
|
||||
result = []
|
||||
regex = re.compile(r'Topic [0-9]*')
|
||||
for line in output.splitlines():
|
||||
|
||||
findall = regex.findall(line)
|
||||
topic_regex = re.compile(r'Topic [0-9]*')
|
||||
|
||||
#####################################
|
||||
# todo save results in file aufgrund von results
|
||||
result = []
|
||||
|
||||
for line in output.splitlines():
|
||||
findall = topic_regex.findall(line)
|
||||
if len(findall) != 0:
|
||||
try:
|
||||
index = int(findall[0].split()[1])
|
||||
|
@ -242,67 +226,136 @@ def jgibbsLLDA(corpus, path2save_results, top_topic_words=7, add_default_topic=F
|
|||
else:
|
||||
result.append(line)
|
||||
|
||||
textacy.fileio.write_file_lines(result, path2save_results)
|
||||
#####################################################################################################################
|
||||
textacy.fileio.write_file_lines(result, path2save_results+".txt")
|
||||
#####################################
|
||||
|
||||
#todo llda termite plot
|
||||
"""
|
||||
topic_inds=[] #<class 'list'>: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
|
||||
results = []
|
||||
res_dict = {}
|
||||
count =0
|
||||
for line in output.splitlines():
|
||||
|
||||
# get topic and term labels
|
||||
# <class 'tuple'>: ('topic 0', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5', 'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10', 'topic 11', 'topic 12', 'topic 13', 'topic 14')
|
||||
topic_labels = tuple('topic {}'.format(topic_ind) for topic_ind in topic_inds)
|
||||
findall = topic_regex.findall(line)
|
||||
|
||||
# <class 'tuple'>: ('hardware', 'raum', 'adresse', 'gebaeude', 'tu', 'uni', 'ticket', 'email', 'account', 'nummer', 'mail', 'outlook', 'karte', 'drucker', 'server', 'service', 'antwort', 'verbindung', 'herzliche', 'einrichten', 'vergessen', 'wenden', 'ews', 'anhang', 'form', 'konto', 'nachricht', 'unterstuetzung', 'passwort', 'unicard', 'semester', 'system', 'aenderung', 'rueckmeldung', 'meldung', 'zugreifen', 'login', 'adressat', 'sender', 'kurs', 'made', 'mittwoch', 'note', 'our', 'korrespondenz', 'unbeschadet', 'boss', 'unterrichten', 'telefax', 'zugang', 'probleme', 'zugriff', 'mitarbeiterin', 'internet', 'daten', 'anmeldung', 'aendern', 'unterschrift', 'loeschen', 'anmelden', 'datei', 'message', 'laptop', 'benoetigt', 'link', 'montag', 'programm', 'ordner', 'personal', 'rechner', 'veranstaltung', 'august', 'lizenz', 'anschluss', 'mitarbeiter', 'erwuenscht', 'umzug', 'pc', 'uniaccount', 'amt', 'fax', 'it', 'institut', 'nutzer', 'bild', 'type', 'prof', 'verantwortlicher', 'bemerkung', 'antragsteller', 'element', 'hahn', 'eintrag', 'telefonbuch', 'ansprechpartner', 'universitaet', 'physik', 'abteilung', 'fakultaet', 'software', 'dezernat', 'einrichtung', 'telefon', 'lehrstuhl', 'buero')
|
||||
term_labels = tuple(id2term[term_ind] for term_ind in term_inds)
|
||||
if len(findall) != 0:
|
||||
|
||||
if len(res_dict) != 0:
|
||||
results.append(res_dict) #vorheriges an die liste ran (ist ja dann fertig)
|
||||
|
||||
index = int(findall[0].split()[1])
|
||||
|
||||
res_dict = {index : str(reverse_labeldict[index]) }
|
||||
|
||||
else:
|
||||
splitted = line.split()
|
||||
res_dict[splitted[0]] = float(splitted[1])
|
||||
|
||||
### print terms that are topics
|
||||
for s in list(res_dict.values()):
|
||||
if isinstance(s,str) and splitted[0] in s:
|
||||
vals = list(res_dict.values())
|
||||
keys = list(res_dict.keys())
|
||||
for v in vals:
|
||||
if not isinstance(v,float):
|
||||
print("{}".format(v))
|
||||
print("{}".format(splitted[0]))
|
||||
count +=1
|
||||
print()
|
||||
###
|
||||
|
||||
|
||||
if len(res_dict) != 0:
|
||||
results.append(res_dict) # letzes an die liste ran
|
||||
|
||||
print(count)
|
||||
print(float(count)/float(len(labelist)))
|
||||
|
||||
|
||||
|
||||
|
||||
# {0: 'betrieb', 'service': 0.24162679425837305, 'support': 0.24162679425837305, 'browser': 0.24162679425837305, 'unicard': 0.24162679425837305, 'telefon': 0.0023923444976076593}
|
||||
|
||||
|
||||
# every term in the resulsts to a list
|
||||
|
||||
terms=[]
|
||||
for res in results:
|
||||
for key,value in res.items():
|
||||
if not isinstance(key, int) and not key in terms:
|
||||
terms.append(key)
|
||||
|
||||
term2id = {t:i for i,t in enumerate(terms)} #and to dict
|
||||
|
||||
################# termite plot #####################################################################
|
||||
|
||||
#term_topic_weights.shape = (len(term_ids),len(topic_ids)
|
||||
|
||||
|
||||
#topic_labels = tuple(labelist)
|
||||
|
||||
topic_labels = list(range(len(labelist)))
|
||||
term_labels = list(range(len(term2id))) #tuple([key for key in term2id.keys()])
|
||||
|
||||
|
||||
term_topic_weights = np.zeros((len(term2id),len(topic_labels)))
|
||||
|
||||
|
||||
|
||||
for i,res in enumerate(results):
|
||||
|
||||
for key,value in res.items():
|
||||
|
||||
if not isinstance(key, int):
|
||||
term_topic_weights[term2id[key]][i] = value
|
||||
term_labels[term2id[key]] = key
|
||||
else:
|
||||
topic_labels[i] = reverse_labeldict[key]
|
||||
|
||||
# get topic-term weights to size dots
|
||||
#[[ 0.02721858 -0.03898025 0.00047936 ..., 0.05862538 -0.07742336 0.04761928]
|
||||
# [ 0.14977875 -0.24192522 -0.00620335 ..., -0.0497216 0.08269951 -0.05715901]
|
||||
# [ 0.04977951 0.02296709 0.01214562 ..., 0.11444371 -0.15212482 0.21481788]
|
||||
# ...,
|
||||
# [
|
||||
term_topic_weights = np.array([self.model.components_[topic_ind][term_inds]
|
||||
for topic_ind in topic_inds]).T
|
||||
|
||||
viz.draw_termite_plot(
|
||||
term_topic_weights, topic_labels, term_labels, save=path2save_results)
|
||||
"""
|
||||
logprint("")
|
||||
term_topic_weights, topic_labels, term_labels, save=path2save_results+".png")
|
||||
|
||||
|
||||
end = time.time()
|
||||
logprint("\n\n\nTime Elapsed Topic Modeling JGibbsLLDA:{0} min\n\n".format((end - start) / 60))
|
||||
logprint("Time Elapsed Topic Modeling JGibbsLLDA:{0} min\n".format((end - start) / 60))
|
||||
|
||||
|
||||
def main(use_raw=False, algorithm="llda"):
|
||||
logprint("Topic Modeling: {0}".format(datetime.now()))
|
||||
|
||||
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||
|
||||
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
||||
|
||||
|
||||
if use_raw:
|
||||
# fehler Unknown document label ( X ) for document 352.
|
||||
preCorpus_name = "de" + "_raw_ticket"
|
||||
resultspath = FILEPATH + "results/raw"
|
||||
|
||||
else:
|
||||
preCorpus_name = "de" + "_pre_ticket"
|
||||
resultspath = FILEPATH + "results/pre"
|
||||
def main(use_cleaned=False, algorithm="llda"):
|
||||
|
||||
# load raw corpus and create new one
|
||||
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
|
||||
logprint("Corpus loaded: {0}".format(de_corpus.lang))
|
||||
|
||||
# idee http://bigartm.org/
|
||||
# idee http://wiki.languagetool.org/tips-and-tricks
|
||||
# idee https://en.wikipedia.org/wiki/Noisy_text_analytics
|
||||
# idee https://gate.ac.uk/family/
|
||||
|
||||
|
||||
|
||||
logprint("Topic Modeling: {0}".format(datetime.now()))
|
||||
|
||||
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
|
||||
corpus_en_path = FILEPATH + config.get("en_corpus", "path")
|
||||
|
||||
|
||||
if use_cleaned:
|
||||
preCorpus_name = "de" + "_clean_ticket"
|
||||
resultspath = FILEPATH + "results/clean"
|
||||
else:
|
||||
preCorpus_name = "de" + "_pre_ticket"
|
||||
resultspath = FILEPATH + "results/pre"
|
||||
|
||||
|
||||
|
||||
# load cleand corpus
|
||||
de_corpus, parser = load_corpus(corpus_name=preCorpus_name, corpus_path=corpus_de_path)
|
||||
logprint("Corpus loaded: {0}".format(de_corpus.lang))
|
||||
|
||||
|
||||
|
||||
# todo llda topics zusammenfassen
|
||||
# idee lda so trainieren, dass zuordnung term <-> topic nicht zu schwach wird, aber möglichst viele topics
|
||||
# frage lda wieviele tickets pro topic?
|
||||
# frage wieviele tickets pro topic?
|
||||
|
||||
"""
|
||||
ngrams = 1
|
||||
|
@ -324,47 +377,26 @@ def main(use_raw=False, algorithm="llda"):
|
|||
|
||||
if algorithm == "llda":
|
||||
top_topic_words = 5
|
||||
add_default_topic = False
|
||||
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
top_topic_words = 5
|
||||
add_default_topic = True
|
||||
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
|
||||
|
||||
"""
|
||||
top_topic_words = 10
|
||||
add_default_topic = False
|
||||
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
top_topic_words = 10
|
||||
add_default_topic = True
|
||||
path2save_results = resultspath + "_{}_{}.txt".format("top"+str(top_topic_words), "wdef" if add_default_topic else "")
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words,
|
||||
add_default_topic=add_default_topic)
|
||||
|
||||
# no_below = 20
|
||||
# no_above = 0.5
|
||||
path2save_results = resultspath + "_{}_{}".format(algorithm,"top"+str(top_topic_words))
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
|
||||
|
||||
|
||||
# n_topics = len(LABELDICT)#len(set(ticketcorpus[0].metadata.keys()))+1 #+1 wegen einem default-topic
|
||||
top_topic_words = 15
|
||||
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
|
||||
|
||||
top_topic_words = 20
|
||||
path2save_results = resultspath + "_{}_{}".format(algorithm, "top" + str(top_topic_words))
|
||||
jgibbsLLDA(de_corpus, path2save_results=path2save_results, top_topic_words=top_topic_words)
|
||||
|
||||
"""
|
||||
else:
|
||||
|
||||
# build dictionary of ticketcategories
|
||||
labelist = []
|
||||
|
||||
for texdoc in de_corpus.get(lambda texdoc: texdoc.metadata["categoryName"] not in labelist):
|
||||
labelist.append(texdoc.metadata["categoryName"])
|
||||
|
||||
labeldict = {k: v for v, k in enumerate(labelist)}
|
||||
|
||||
|
||||
|
||||
textacyTopicModeling(ngrams = 1,
|
||||
min_df = 1,
|
||||
|
@ -372,7 +404,7 @@ def main(use_raw=False, algorithm="llda"):
|
|||
topicModel = algorithm,
|
||||
n_topics =15,
|
||||
corpus=de_corpus)
|
||||
|
||||
"""
|
||||
textacyTopicModeling(ngrams=1,
|
||||
min_df=1,
|
||||
max_df=0.9,
|
||||
|
@ -394,7 +426,7 @@ def main(use_raw=False, algorithm="llda"):
|
|||
topicModel=algorithm,
|
||||
n_topics=30,
|
||||
corpus=de_corpus)
|
||||
|
||||
"""
|
||||
|
||||
|
||||
textacyTopicModeling(ngrams=(1, 2),
|
||||
|
@ -403,7 +435,7 @@ def main(use_raw=False, algorithm="llda"):
|
|||
topicModel=algorithm,
|
||||
n_topics=15,
|
||||
corpus=de_corpus)
|
||||
|
||||
"""
|
||||
textacyTopicModeling(ngrams = (1,2),
|
||||
min_df = 1,
|
||||
max_df = 0.9,
|
||||
|
@ -425,59 +457,7 @@ def main(use_raw=False, algorithm="llda"):
|
|||
topicModel = algorithm,
|
||||
n_topics =30,
|
||||
corpus=de_corpus)
|
||||
|
||||
|
||||
|
||||
"""
|
||||
textacyTopicModeling(ngrams = (1,2),
|
||||
min_df = 1,
|
||||
max_df = 0.8,
|
||||
topicModel = algorithm,
|
||||
n_topics =n_topics,
|
||||
corpus=de_corpus)
|
||||
|
||||
"""
|
||||
|
||||
"""
|
||||
textacyTopicModeling(ngrams = 1,
|
||||
min_df = 0.1,
|
||||
max_df = 0.6,
|
||||
topicModel = algorithm,
|
||||
n_topics =n_topics,
|
||||
corpus=de_corpus)
|
||||
|
||||
|
||||
|
||||
|
||||
textacyTopicModeling(ngrams = (1,2),
|
||||
min_df = 1,
|
||||
max_df = 1.0,
|
||||
topicModel = algorithm,
|
||||
n_topics =n_topics,
|
||||
corpus=de_corpus)
|
||||
|
||||
textacyTopicModeling(ngrams = (1,2),
|
||||
min_df = 0.1,
|
||||
max_df = 0.6,
|
||||
topicModel = algorithm,
|
||||
n_topics =n_topics,
|
||||
corpus=de_corpus)
|
||||
|
||||
|
||||
|
||||
|
||||
textacyTopicModeling(ngrams = (1,2),
|
||||
min_df = 0.2,
|
||||
max_df = 0.8,
|
||||
topicModel = algorithm,
|
||||
n_topics = 20,
|
||||
corpus=de_corpus)
|
||||
"""
|
20561
vornamen.txt
File diff suppressed because it is too large