You cannot select more than 25 topics.
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							94 lines
						
					
					
						
							2.0 KiB
						
					
					
				
			
		
		
		
			
			
			
				
					
				
				
					
				
			
		
		
	
	
							94 lines
						
					
					
						
							2.0 KiB
						
					
					
				
								import unicodedata
							 | 
						|
								def is_chinese(char):
							 | 
						|
								    if 'CJK' in unicodedata.name(char):
							 | 
						|
								        return True
							 | 
						|
								    else:
							 | 
						|
								        return False
							 | 
						|
								
							 | 
						|
								
							 | 
						|
								
							 | 
						|
								a = "ab1我们12是一个"
							 | 
						|
								
							 | 
						|
								b = [""] *len(a)
							 | 
						|
								
							 | 
						|
								last_post = False
							 | 
						|
								
							 | 
						|
								c = []
							 | 
						|
								for i, d in enumerate(a):
							 | 
						|
								    bool_ = is_chinese(d)
							 | 
						|
								    if bool_ == False:
							 | 
						|
								        b[i] = d
							 | 
						|
								        last_post = False
							 | 
						|
								    else:
							 | 
						|
								        if last_post == False:
							 | 
						|
								            c.append([(i,d)])
							 | 
						|
								        else:
							 | 
						|
								            c[-1].append((i,d))
							 | 
						|
								        last_post = True
							 | 
						|
								print(c)
							 | 
						|
								print(b)
							 | 
						|
								
							 | 
						|
								d = []
							 | 
						|
								for i in c:
							 | 
						|
								    d.append("".join([j[1] for j in i]))
							 | 
						|
								print(d)
							 | 
						|
								
							 | 
						|
								e = d
							 | 
						|
								
							 | 
						|
								f = ""
							 | 
						|
								for i in e:
							 | 
						|
								    f += i
							 | 
						|
								f_list = list(f)
							 | 
						|
								print(f_list)
							 | 
						|
								
							 | 
						|
								for i,d in enumerate(b):
							 | 
						|
								    if d == "":
							 | 
						|
								        zi = f_list.pop(0)
							 | 
						|
								        print(zi)
							 | 
						|
								        b[i] = zi
							 | 
						|
								print(b)
							 | 
						|
								
							 | 
						|
								class SentenceUlit:
							 | 
						|
								    def __init__(self,sentence):
							 | 
						|
								        self.sentence = sentence
							 | 
						|
								        self.sentence_list = [""] * len(sentence)
							 | 
						|
								        self.last_post = False
							 | 
						|
								        self.sentence_batch = []
							 | 
						|
								        self.pre_ulit()
							 | 
						|
								        self.inf_sentence_batch_str = ""
							 | 
						|
								
							 | 
						|
								
							 | 
						|
								    def is_chinese(self, char):
							 | 
						|
								        if 'CJK' in unicodedata.name(char):
							 | 
						|
								            return True
							 | 
						|
								        else:
							 | 
						|
								            return False
							 | 
						|
								
							 | 
						|
								    def pre_ulit(self):
							 | 
						|
								        for i, d in enumerate(self.sentence):
							 | 
						|
								            bool_ = is_chinese(d)
							 | 
						|
								            if bool_ == False:
							 | 
						|
								                self.sentence_list[i] = d
							 | 
						|
								                self.last_post = False
							 | 
						|
								            else:
							 | 
						|
								                if self.last_post == False:
							 | 
						|
								                    self.sentence_batch.append(d)
							 | 
						|
								                else:
							 | 
						|
								                    self.sentence_batch[-1] += d
							 | 
						|
								                self.last_post = True
							 | 
						|
								
							 | 
						|
								    def inf_ulit(self, sen):
							 | 
						|
								        for i in sen:
							 | 
						|
								            self.inf_sentence_batch_str += i
							 | 
						|
								        self.inf_sentence_batch_srt_list = list(self.inf_sentence_batch_str)
							 | 
						|
								
							 | 
						|
								        for i, d in enumerate(self.sentence_list):
							 | 
						|
								            if d == "":
							 | 
						|
								                zi = self.inf_sentence_batch_srt_list.pop(0)
							 | 
						|
								                self.sentence_list[i] = zi
							 | 
						|
								
							 | 
						|
								
							 | 
						|
								sen = SentenceUlit("ab1我们12是一个")
							 | 
						|
								
							 | 
						|
								print(sen.sentence_batch)
							 | 
						|
								print(sen.sentence_list)
							 |