"""Split a mixed sentence into CJK runs and put replacement characters back.

The module first walks through the idea step by step with plain top-level
statements (a small demo that prints its intermediate results), then wraps
the same logic in the ``SentenceUlit`` class.
"""
import unicodedata


def is_chinese(char):
    """Return True if the Unicode name of *char* contains ``'CJK'``.

    Args:
        char: A single-character string.

    Returns:
        bool: True when ``'CJK'`` occurs in the character's Unicode name,
        False otherwise -- including for characters that have no Unicode
        name at all (e.g. ``"\\n"``).
    """
    # The default argument avoids the ValueError that unicodedata.name()
    # raises for unnamed characters such as control codes.
    return 'CJK' in unicodedata.name(char, '')


# ---------------------------------------------------------------------------
# Step-by-step demo of the algorithm on a sample string.
# ---------------------------------------------------------------------------
a = "ab1我们12是一个"
b = [""] * len(a)   # non-CJK characters in place; "" marks a CJK slot
last_post = False   # True while the previous character was CJK
c = []              # runs of consecutive CJK characters as (index, char) pairs
for i, ch in enumerate(a):
    if is_chinese(ch):
        if last_post:
            c[-1].append((i, ch))   # extend the current run
        else:
            c.append([(i, ch)])     # start a new run
        last_post = True
    else:
        b[i] = ch
        last_post = False
print(c)
print(b)

# Collapse every run to a plain string.
d = ["".join(pair[1] for pair in run) for run in c]
print(d)

e = d
f = "".join(e)
f_list = list(f)
print(f_list)

# Refill the empty slots of ``b`` in order, one character per slot.
remaining = iter(f_list)
for i, slot in enumerate(b):
    if slot == "":
        zi = next(remaining)
        print(zi)
        b[i] = zi
print(b)


class SentenceUlit:
    """Separate the CJK runs of a sentence and later refill their slots.

    Attributes:
        sentence: The original sentence passed to the constructor.
        sentence_list: One entry per character of ``sentence``; non-CJK
            characters are stored in place and ``""`` marks a CJK slot
            (until ``inf_ulit`` fills it).
        sentence_batch: The runs of consecutive CJK characters, as strings,
            in order of appearance.
        last_post: Scan state -- whether the last scanned character was CJK.
        inf_sentence_batch_str: Concatenated replacement text given to the
            most recent ``inf_ulit`` call.
        inf_sentence_batch_srt_list: Replacement characters left over after
            the most recent ``inf_ulit`` call.
    """

    def __init__(self, sentence):
        """Store *sentence* and split it immediately via ``pre_ulit``."""
        self.sentence = sentence
        self.sentence_list = [""] * len(sentence)
        self.last_post = False
        self.sentence_batch = []
        self.inf_sentence_batch_str = ""
        self.inf_sentence_batch_srt_list = []
        self.pre_ulit()

    def is_chinese(self, char):
        """Return True if the Unicode name of *char* contains ``'CJK'``.

        Characters without a Unicode name yield False instead of raising.
        """
        return 'CJK' in unicodedata.name(char, '')

    def pre_ulit(self):
        """Scan ``self.sentence`` and populate the split state.

        Non-CJK characters are copied into ``sentence_list`` at their own
        index; each run of consecutive CJK characters becomes one string in
        ``sentence_batch`` and leaves ``""`` placeholders in
        ``sentence_list``. The state is rebuilt from scratch, so calling
        this again does not duplicate batches.
        """
        self.sentence_list = [""] * len(self.sentence)
        self.sentence_batch = []
        self.last_post = False
        for i, ch in enumerate(self.sentence):
            # Go through the method (not the module-level function) so a
            # subclass can override the character test.
            if self.is_chinese(ch):
                if self.last_post:
                    self.sentence_batch[-1] += ch
                else:
                    self.sentence_batch.append(ch)
                self.last_post = True
            else:
                self.sentence_list[i] = ch
                self.last_post = False

    def inf_ulit(self, sen):
        """Fill the empty slots of ``sentence_list`` with characters of *sen*.

        Args:
            sen: An iterable of strings (a single string also works); the
                pieces are concatenated and consumed one character per
                empty slot, left to right.

        Raises:
            IndexError: If *sen* provides fewer characters than there are
                empty slots. Nothing is modified in that case.
        """
        # Start from a clean buffer on every call instead of appending to
        # whatever an earlier call left behind.
        text = "".join(sen)
        needed = sum(1 for slot in self.sentence_list if slot == "")
        if len(text) < needed:
            raise IndexError(
                "not enough replacement characters: need %d, got %d"
                % (needed, len(text))
            )
        self.inf_sentence_batch_str = text
        chars = iter(text)
        for i, slot in enumerate(self.sentence_list):
            if slot == "":
                self.sentence_list[i] = next(chars)
        # Whatever was not consumed stays available, as before.
        self.inf_sentence_batch_srt_list = list(chars)


sen = SentenceUlit("ab1我们12是一个")
print(sen.sentence_batch)
print(sen.sentence_list)