topicModelingTickets/cleaning.py

174 lines
3.6 KiB
Python

# -*- coding: utf-8 -*-
import os
import time
from datetime import datetime
import textacy
from scipy import *
from miscellaneous import *
# Raise the csv module's maximum field size to sys.maxsize.
# NOTE(review): csv and sys are not imported explicitly in this file; presumably
# they arrive through the star imports above - confirm.
csv.field_size_limit(sys.maxsize)

# Directory that contains this module, with a trailing "/", so that file names
# can be appended by plain string concatenation.
FILEPATH = os.path.dirname(os.path.realpath(__file__)) + "/"

# Load the configuration from the config.ini located next to this module.
config_ini = FILEPATH + "config.ini"
config = ConfigParser.ConfigParser()
with open(config_ini) as f:
    config.read_file(f)
############# stringcleaning
def clean(stringstream):
    """
    Lazily clean every text of a stream.

    Each incoming string goes through the following steps, in this order:

    1. fix bad unicode (textacy.preprocess.fix_bad_unicode)
    2. replace every separator character (see the pattern in the body) by a
       single space, so that words joined by such a character are split
    3. normalize whitespace (textacy.preprocess.normalize_whitespace)
    4. replace the remaining line breaks by spaces
    5. apply replaceRockDots (helper that is not defined in this module)

    :param stringstream: iterable of str
    :return: generator yielding one cleaned str per input str
    """
    # Characters that separate words: ` = ~ % ^ & * ( ) _ + [ ] { } ; ' " | < / >
    # Compiled once per generator instead of once per text.
    separators = re.compile(r'[`\=~%^&*()_+\[\]{};\'"|</>]')

    # NOTE: an earlier attempt to restore the capitalisation of nouns from a
    # noun list was disabled because it also altered words in the middle of
    # sentences; it is intentionally not part of this pipeline.
    for string in stringstream:
        string = textacy.preprocess.fix_bad_unicode(string)

        # Equivalent to " ".join(separators.split(string)): the pattern has no
        # capture groups, so every match is simply replaced by one space.
        string = separators.sub(" ", string)

        string = textacy.preprocess.normalize_whitespace(string)

        # Remove the line breaks left after whitespace normalisation.
        string = string.replace("\n", " ")

        string = replaceRockDots(string)

        yield string
##################################################################################################
# Location handed to save_corpus() as corpus_path when a cleaned corpus is
# stored: FILEPATH joined with the "path" option of the "de_corpus" config section.
corpus_de_path = FILEPATH + config.get("de_corpus", "path")
def cleanCorpus(corpus):
    """
    Build a cleaned copy of ``corpus``, save it and return it.

    The texts of ``corpus`` are streamed through ``clean`` and added, together
    with the corpus metadata, to a new ``textacy.Corpus`` created from
    ``corpus.spacy_lang``. Documents of length zero are removed afterwards and
    the result is saved under the name ``<corpus.lang>_clean``.

    :param corpus: corpus object providing ``lang`` and ``spacy_lang``
    :return: the cleaned textacy.Corpus
    """
    logprint("Clean {0}_corpus at {1}".format(corpus.lang, datetime.now()))

    def is_empty(doc):
        # A document without any content has length zero.
        return len(doc) == 0

    # New, empty corpus built from the same spacy_lang as the input corpus.
    cleaned_corpus = textacy.Corpus(corpus.spacy_lang)

    # Actually clean the corpus: texts pass through clean(), metadata is
    # taken over as delivered by corpus2Meta().
    cleaned_texts = clean(corpus2Text(corpus))
    metadata = corpus2Meta(corpus)
    cleaned_corpus.add_texts(cleaned_texts, metadata)

    # Drop empty docs from the corpus.
    cleaned_corpus.remove(is_empty)

    # Save the corpus.
    # NOTE(review): the target is always corpus_de_path, whatever corpus.lang
    # is - confirm that this is intended for non-German corpora.
    target_name = corpus.lang + "_clean"
    save_corpus(
        corpus=cleaned_corpus,
        corpus_path=corpus_de_path,
        corpus_name=target_name,
    )

    return cleaned_corpus
def main(corpus):
    """
    Run ``cleanCorpus`` on ``corpus`` and log how long it took, in minutes.

    :param corpus: handed to cleanCorpus unchanged
    :return: the value returned by cleanCorpus
    """
    started = time.time()
    result = cleanCorpus(corpus)
    finished = time.time()

    elapsed_minutes = (finished - started) / 60
    logprint("Time Elapsed Cleaning:{0} min".format(elapsed_minutes))

    return result
if __name__ == "__main__":
    # Script entry point: load the raw German corpus and clean it.
    # load_corpus() returns two values; only the corpus itself is needed here.
    # NOTE(review): the corpus location is a hard-coded absolute path on one
    # developer machine - confirm whether it should come from config.ini.
    raw_corpus, _ = load_corpus(
        corpus_path="/home/jannis.grundmann/PycharmProjects/topicModelingTickets/corpi/",
        corpus_name="de_raw",
    )
    main(raw_corpus)