# Put your text in a file called input.txt in the same directory.
import subprocess
import csv
import string
import time


class MeCabNode:
    """One parsed row of MeCab output.

    Every attribute is a string and defaults to the empty string. The
    attribute names follow the column order that ``MeCab.getNode`` reads
    them in: term, pos, pos1, pos2, pos3, rule, conj, dictform, reading,
    pronunc.
    """

    # Column order of a parsed row; shared by getNode, __eq__ and __hash__.
    _FIELDS = (
        "term",
        "pos",
        "pos1",
        "pos2",
        "pos3",
        "rule",
        "conj",
        "dictform",
        "reading",
        "pronunc",
    )

    def __init__(self, term="", pos="", pos1="", pos2="", pos3="", rule="",
                 conj="", dictform="", reading="", pronunc=""):
        self.term = term
        self.pos = pos
        self.pos1 = pos1
        self.pos2 = pos2
        self.pos3 = pos3
        self.rule = rule
        self.conj = conj
        self.dictform = dictform
        self.reading = reading
        self.pronunc = pronunc

    def _key(self):
        """Return all field values as a tuple, in column order."""
        return tuple(getattr(self, name) for name in self._FIELDS)

    def __eq__(self, other):
        """Two nodes are equal when all ten fields are equal."""
        if not isinstance(other, MeCabNode):
            return NotImplemented
        return self._key() == other._key()

    def __hash__(self):
        # Value-based hash so that a set of nodes actually collapses
        # identical nodes. Do not mutate a node after adding it to a set.
        return hash(self._key())

    def toStringExtended(self):
        """Return every field on its own labelled line, newline-terminated."""
        labelled = (
            ("Term", self.term),
            ("Pos", self.pos),
            ("Pos1", self.pos1),
            ("Pos2", self.pos2),
            ("Pos3", self.pos3),
            ("Rule", self.rule),
            ("Conjugation", self.conj),
            ("Dictionary Form", self.dictform),
            ("Reading", self.reading),
            ("Pronunciation", self.pronunc),
        )
        return "".join(f"{label}: {value}\n" for label, value in labelled)

    def toString(self):
        """Return the dictionary form of the term."""
        return self.dictform


class MeCab:
    """Runs the MeCab executable on input.txt and parses its output."""

    # Dictionary forms that are never reported (placeholders, punctuation
    # and a handful of particles).
    _EXCLUDED_TERMS = frozenset(
        ["", "*", "。", "、", "(", ")", "の", "は", "が", "と", "で", "を", "に"]
    )

    def getMatrix(self, mecabdir):
        """Run MeCab on input.txt and return its output as a list of rows.

        Side effects: writes ``mecab-raw-output.txt`` (via MeCab) and
        ``mecab-output-tidied.txt`` in the current directory.

        Args:
            mecabdir: Directory that contains ``mecab.exe``.

        Returns:
            A list of lists of strings, one inner list per tidied line.

        Raises:
            subprocess.CalledProcessError: MeCab exited with a non-zero
                status.
            OSError: The executable or one of the files could not be opened.
        """
        # Block until MeCab has finished instead of sleeping for a fixed
        # time; an argument list avoids shell quoting problems in the path.
        subprocess.run(
            [mecabdir + "\\mecab.exe", "input.txt", "-o", "mecab-raw-output.txt"],
            check=True,
        )

        # Turn the tab that separates the first column from the rest into a
        # comma so the whole line can be parsed as CSV. Only lines that
        # contain a tab are carried over to the tidied file.
        with open("mecab-raw-output.txt", encoding="utf-8") as raw, \
                open("mecab-output-tidied.txt", "w", encoding="utf-8") as tidied:
            for line in raw:
                if "\t" in line:
                    tidied.write(line.replace("\t", ","))

        with open("mecab-output-tidied.txt", encoding="utf-8") as tidied:
            return list(csv.reader(tidied))

    def getNode(self, line):
        """Build a MeCabNode from one parsed row.

        Fields are filled in column order for as many values as the row
        provides; missing trailing fields keep their empty-string default
        and values beyond the tenth column are ignored.

        Args:
            line: A sequence of strings (one row from ``getMatrix``).

        Returns:
            A new MeCabNode.
        """
        node = MeCabNode()
        # zip stops at the shorter sequence, so short rows no longer raise
        # IndexError and long rows are truncated to the known columns.
        for name, value in zip(MeCabNode._FIELDS, line):
            setattr(node, name, value)
        return node

    def isValidTerm(self, term):
        """Return False for excluded terms (see _EXCLUDED_TERMS), else True."""
        return term not in self._EXCLUDED_TERMS

    def getNodeSet(self, mecabdir):
        """Run MeCab and return the set of nodes whose dictform is valid.

        Args:
            mecabdir: Directory that contains ``mecab.exe``.

        Returns:
            A set of MeCabNode objects; nodes with identical fields are
            stored once.
        """
        nodeset = set()
        for line in self.getMatrix(mecabdir):
            node = self.getNode(line)
            if self.isValidTerm(node.dictform):
                nodeset.add(node)
        return nodeset


def main():
    """Analyse input.txt and write one dictionary form per line to output.txt."""
    mecab = MeCab()
    nodeset = mecab.getNodeSet("C:\\Program Files (x86)\\MeCab\\bin")
    with open("output.txt", "w", encoding="utf-8") as out:
        for node in nodeset:
            out.write(node.toString() + "\n")


if __name__ == "__main__":
    main()