macbert/ceshifenli.py


								import unicodedata

								def is_chinese(char):
								    """Return True if ``char`` is a CJK character, judged by its Unicode name.

								    Args:
								        char: A single-character string.

								    Returns:
								        bool: True when the character's Unicode name contains ``"CJK"``,
								        otherwise False. Characters that have no Unicode name (for
								        example control characters such as a newline) yield False.

								    Raises:
								        TypeError: If ``char`` is not a one-character string (raised by
								            ``unicodedata.name``).
								    """
								    # The '' default keeps unnamed code points from raising ValueError.
								    return 'CJK' in unicodedata.name(char, '')


								# --- Demo: split a mixed string into CJK runs, then put the characters back ---

								# Sample input mixing ASCII letters, digits and CJK characters.
								a = "ab1我们12是一个"

								# b holds one slot per character of a: non-CJK characters are stored at
								# their own index, CJK positions stay "" as placeholders.
								b = [""] * len(a)

								# c collects each run of consecutive CJK characters as a list of
								# (index, char) tuples.
								c = []

								# True while the previously scanned character was CJK.
								last_post = False

								for idx, char in enumerate(a):
								    if not is_chinese(char):
								        b[idx] = char
								        last_post = False
								    elif last_post:
								        # Still inside the current CJK run.
								        c[-1].append((idx, char))
								    else:
								        # First CJK character after a non-CJK one starts a new run.
								        c.append([(idx, char)])
								        last_post = True

								print(c)
								print(b)

								# d: every CJK run flattened to a plain string.
								d = ["".join(char for _, char in run) for run in c]
								print(d)

								e = d

								# f: all CJK characters of a, in order, as one string.
								f = "".join(e)
								f_list = list(f)
								print(f_list)

								# Refill the placeholders in b, consuming the CJK characters in order.
								# An iterator avoids the O(n) cost of list.pop(0) on every step and a
								# distinct loop variable keeps the list d from being overwritten.
								remaining = iter(f_list)
								for idx, slot in enumerate(b):
								    if slot == "":
								        zi = next(remaining)
								        print(zi)
								        b[idx] = zi

								print(b)


								class SentenceUlit:
								    """Split a sentence into CJK runs and later put replacement characters back.

								    On construction the sentence is scanned once: every non-CJK character is
								    stored at its own index in ``sentence_list`` while CJK positions stay as
								    ``""`` placeholders, and each run of consecutive CJK characters is
								    collected as one string in ``sentence_batch``. ``inf_ulit`` then fills
								    the placeholders, in order, with the characters of the strings it is
								    given.
								    """

								    def __init__(self, sentence: str) -> None:
								        """Scan ``sentence`` and build ``sentence_list`` and ``sentence_batch``.

								        Args:
								            sentence: The text to split.
								        """
								        self.sentence = sentence
								        # One slot per character; "" marks a CJK position still to be filled.
								        self.sentence_list = [""] * len(sentence)
								        # True while the most recently scanned character was CJK.
								        self.last_post = False
								        # Consecutive CJK characters, one string per run.
								        self.sentence_batch = []
								        # Concatenation of every string handed to inf_ulit so far.
								        self.inf_sentence_batch_str = ""
								        # All state is in place before the scan runs.
								        self.pre_ulit()

								    def is_chinese(self, char: str) -> bool:
								        """Return True if the Unicode name of ``char`` contains ``"CJK"``.

								        Characters without a Unicode name (e.g. control characters) yield
								        False instead of raising ValueError.
								        """
								        return 'CJK' in unicodedata.name(char, '')

								    def pre_ulit(self) -> None:
								        """Populate ``sentence_list`` and ``sentence_batch`` from ``sentence``.

								        Non-CJK characters are written to their index in ``sentence_list``;
								        CJK characters are appended to the current run in ``sentence_batch``,
								        starting a new run whenever the previous character was not CJK.
								        """
								        for idx, char in enumerate(self.sentence):
								            # Use the class's own predicate rather than a module-level helper
								            # so the class does not depend on an unrelated global.
								            if not self.is_chinese(char):
								                self.sentence_list[idx] = char
								                self.last_post = False
								            elif self.last_post:
								                self.sentence_batch[-1] += char
								            else:
								                self.sentence_batch.append(char)
								                self.last_post = True

								    def inf_ulit(self, sen) -> None:
								        """Fill the ``""`` placeholders in ``sentence_list`` from ``sen``.

								        Args:
								            sen: Iterable of strings. They are concatenated in order (and
								                appended to ``inf_sentence_batch_str``); the resulting
								                characters are written into the placeholders left to right.

								        Raises:
								            IndexError: If fewer characters are available than there are
								                placeholders to fill.
								        """
								        self.inf_sentence_batch_str += "".join(sen)
								        pending = list(self.inf_sentence_batch_str)
								        self.inf_sentence_batch_srt_list = pending

								        # Walk the placeholders with a cursor instead of pop(0), which
								        # would shift the whole list on every step.
								        used = 0
								        for idx, slot in enumerate(self.sentence_list):
								            if slot == "":
								                self.sentence_list[idx] = pending[used]
								                used += 1

								        # Leave only the unconsumed characters, as repeated pop(0) would.
								        del pending[:used]


								# Usage demo: split the sample sentence, then show the CJK runs followed
								# by the per-character slots (CJK positions are "" placeholders).
								sen = SentenceUlit("ab1我们12是一个")

								print(sen.sentence_batch, sen.sentence_list, sep="\n")