"""Flask service that turns a .docx thesis into structured PPT content.

Pipeline: read the document, extract its heading structure, ask an LLM to
shrink the table of contents to four first-level headings, summarise every
subsection in parallel through a self-hosted model, and return the
assembled structure as JSON.
"""

import concurrent.futures
import json
import os
import platform
import random
import re
import time

import docx
import requests
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import _Cell, Table
from docx.text.paragraph import Paragraph
from flask import Flask, render_template, request, redirect, url_for, jsonify
from werkzeug.utils import secure_filename

# Route outbound HTTP(S) traffic through a local proxy (needed to reach the
# OpenAI endpoint from this network).
os.environ['ALL_PROXY'] = 'http://127.0.0.1:10809'

app = Flask(__name__)

# Directory where uploaded files are stored.
UPLOAD_FOLDER = 'uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Regular expressions (raw strings so backslashes reach the regex engine
# unmodified).
RE_CHINA_NUMS = r"[1-9].(.*)"

# Allowed upload extensions.
ALLOWED_EXTENSIONS = {'docx'}

# Catalogue / heading patterns. "yijibiaoti" = first-level heading
# (一、xxx), "erjibiaoti" = second-level heading (1.1 xxx).
pantten_mulu = r'目录(.*?)致谢'
pantten_xiaobiaoti = "{}(.*?){}"
pantten_yijibiaoti = r'^([一二三四五六七八九])、(.*)'
pantten_erjibiaoti = r'^[0-9](\.[0-9]\d*){1}\s{1,}?.*$'
pantten_content_tiaoshu = r'[0-9]\.{1}\s{0,}?(.*)'
pantten_yijibiaoti_content = r'^[一二三四五六七八九]、(.*)'
pantten_erjibiaoti_content = r'^[0-9]\.[0-9]\s{1,}?(.*)$'

# LLM prompt templates (runtime strings — kept verbatim).
prompt_two_title_min_max = "为论文题目“{}”生成中文目录,要求只有一级标题,二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题生成{}个;每个一级标题包含{}-{}个二级标题"
prompt_two_title_not_min_max = "为论文题目“{}”生成中文目录,要求只有一级标题,二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题生成{}个;每个一级标题包含{}个二级标题"
pantten_title = "为论文题目“(.*?)”生成中文目录"
pantten_xiaobiaoti_geshu = "每个一级标题包含(.*?)个"
pantten_dabiaoti_geshu = "一级标题生成(.*?)个"
mulusuojian = "请问把以下目录缩减成只有4个一级标题作为ppt的题目,请问留下原始目录中的哪4个一级标题最合适,一级标题必须在原始目录中\n{}\n"

self_api = "http://192.168.31.149:12004/predict"
gpt_api = "https://api.openai.com/v1/chat/completions"


class log:
    """Minimal date-partitioned file logger (one file per day)."""

    @staticmethod
    def log(*args, **kwargs):
        """Append a timestamped line to today's log file."""
        now = time.localtime(int(time.time()))
        stamp = time.strftime('%Y/%m/%d-%H:%M:%S', now)
        log_file = 'log_file/access-%s' % time.strftime('%Y-%m-%d', now) + ".log"
        # The original crashed when the log directory was missing, and its
        # exists-check/'w'-vs-'a+' dance was racy; 'a' creates the file on
        # first use.
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        with open(log_file, 'a', encoding='utf-8') as f:
            print(stamp, *args, file=f, **kwargs)


def allowed_file(filename):
    """Return True when *filename* carries an allowed extension."""
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def iter_block_items(parent):
    """
    Yield each paragraph and table child within *parent*, in document order.
    Each returned value is an instance of either Table or Paragraph. *parent*
    would most commonly be a reference to a main Document object, but
    also works for a _Cell object, which itself can contain paragraphs and tables.
    """
    if isinstance(parent, Document):
        parent_elm = parent.element.body
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        raise ValueError("something's not right")
    for child in parent_elm.iterchildren():
        if isinstance(child, CT_P):
            yield Paragraph(child, parent)
        elif isinstance(child, CT_Tbl):
            yield Table(child, parent)


def read_table(table):
    """Return *table* as a list of rows, each a list of cell texts."""
    return [[cell.text for cell in row.cells] for row in table.rows]


def read_word(word_path):
    """Return the full text of a .docx file with tables flattened inline."""
    paper_text = []
    doc = docx.Document(word_path)
    for block in iter_block_items(doc):
        if isinstance(block, Paragraph):
            paper_text.append(block.text)
        elif isinstance(block, Table):
            rows = ["" + "\n".join(row) + "" for row in read_table(block)]
            table_str = "\n\n" + "\n\n\n".join(rows) + "\n\n"
            table_str = "\n" + table_str + "\n\n"
            paper_text.append(table_str)
    return "\n".join(paper_text)


def getText(fileName):
    """Return the plain paragraph text of a .docx file, newline-joined."""
    doc = docx.Document(fileName)
    return '\n'.join(paragraph.text for paragraph in doc.paragraphs)


def request_selfmodel_api(prompt):
    """POST *prompt* to the self-hosted model and return its JSON reply."""
    data = {
        "model": "gpt-4-turbo-preview",
        "messages": [{"role": "user", "content": prompt}],
        "top_p": 0.7,
        "temperature": 0.95,
    }
    # Use the module-level endpoint constant instead of re-hardcoding the URL.
    response = requests.post(self_api, json=data, timeout=100000)
    return response.json()


def request_chatgpt_api(prompt):
    """POST *prompt* to the OpenAI chat-completions API and return JSON."""
    # SECURITY: the key used to be hard-coded in source (and is therefore
    # leaked); prefer the environment, keep the old value only as fallback.
    api_key = os.environ.get(
        "OPENAI_API_KEY", "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")
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    data = {
        "model": "gpt-4-turbo-preview",
        "messages": [{"role": "user", "content": prompt}],
        "top_p": 0.7,
        "temperature": 0.95,
    }
    response = requests.post(gpt_api, headers=headers, json=data, timeout=1200)
    return response.json()


def yanzhengyijibiaoti(mulu, res):
    """Check whether the model-picked first-level headings are usable.

    :param mulu: original catalogue, one heading per line
    :param res: model answer text
    :return: (True when exactly 4 original first-level headings occur in
              *res*, list of the matched headings as "一、xxx" strings)
    """
    dabiaoti_res_list = []
    for line in str(mulu).split("\n"):
        matches = re.findall(pantten_yijibiaoti, line)
        if matches and matches[0][1].strip() in res:
            dabiaoti_res_list.append("、".join(matches[0]))
    return len(dabiaoti_res_list) == 4, dabiaoti_res_list


def get_document_structure(file_path):
    """Return [(heading_level, text), ...] for every Heading-style paragraph."""
    structure = []
    doc = docx.Document(file_path)
    for paragraph in doc.paragraphs:
        style_name = paragraph.style.name
        if style_name.startswith("Heading"):
            # "Heading 1" -> level 1 (slice past the 7-char prefix).
            structure.append((int(style_name[7:]), paragraph.text))
    return structure


def print_document_structure(structure):
    """Pretty-print the heading structure, indented by level."""
    for level, text in structure:
        print(f"{' ' * (level - 1)}- {text}")


def catalogue_extract(structure):
    """Keep only headings shaped like 一、xxx or 1.1 xxx.

    The final entry of *structure* is skipped (mirrors the original
    slicing). Returns (list_of_headings, newline_joined_string).
    """
    catalogue_list = []
    for level, text in structure[:-1]:
        text = str(text).strip(" ").strip("\n")
        if re.findall(pantten_erjibiaoti, text) or re.findall(pantten_yijibiaoti, text):
            catalogue_list.append(text)
    return catalogue_list, "\n".join(catalogue_list)


def main(path):
    """Build the PPT data structure for the .docx file at *path*.

    Returns a dict: {"title": ..., "catalogue": [...], "content_1": [...]}.
    """
    # Portable replacement for the old platform-specific "/" vs "\\" split.
    file_name = os.path.basename(path).split(".")[0]

    # JSON-escape the document text; headings are matched against this
    # escaped form, hence the literal "\\n" handling below.
    text_1 = json.dumps(read_word(path), ensure_ascii=False)

    document_structure = get_document_structure(path)
    mulu_list, catalogue_str = catalogue_extract(document_structure)

    # Everything after the "目录" (table of contents) marker.
    text = text_1.split("目录")[-1].strip("\\n")

    # Map first-level heading -> {second-level heading -> section text}.
    yijibiaoti = ""
    paper_content = {}
    for i in range(len(mulu_list) - 1):
        title = mulu_list[i].strip(" ").strip("\\n")
        # re.escape the headings: they may contain regex metacharacters
        # (e.g. "1.1", parentheses) that would corrupt the pattern.
        section_re = pantten_xiaobiaoti.format(
            re.escape(mulu_list[i]), re.escape(mulu_list[i + 1]))
        found = re.findall(section_re, text)
        # Tolerate a missing section instead of raising IndexError.
        content = str(found[0]).strip(" ").strip("\\n") if found else ""
        if re.findall(pantten_yijibiaoti, title):
            paper_content[title] = {}
            yijibiaoti = title
        elif re.findall(pantten_erjibiaoti, title):
            paper_content[yijibiaoti][title] = content.replace("\\n", "\n")
        else:
            # Original did `+= "\n".join(title + content)`, which both
            # KeyErrored on first hit and interleaved newlines between
            # single characters; append the text instead.
            existing = paper_content[yijibiaoti].get(title, "")
            paper_content[yijibiaoti][title] = existing + "\n" + content

    # Ask the LLM to shrink the catalogue to exactly 4 first-level
    # headings; retry until the answer references 4 original headings.
    prompt = mulusuojian.format("\n".join(mulu_list[:-1]))
    while True:
        try:
            res = request_chatgpt_api(prompt)['choices'][0]['message']['content']
        except Exception:
            # Network / malformed-response failure: back off briefly and
            # retry (the original bare `except: continue` hot-looped).
            time.sleep(1)
            continue
        shaixuan_bool, dabiaoti_res_list = yanzhengyijibiaoti(
            "\n".join(mulu_list), res.replace("\n", "\\n"))
        if shaixuan_bool:
            break

    # Collect one summarisation prompt per second-level section; each
    # content_3 temporarily holds the prompt's index so the concurrent
    # results can be joined back afterwards.
    index_zahnweifu = 0
    zhanweifu = []
    content_1 = []
    catalogue = []
    for yijibiaoti in dabiaoti_res_list:
        content_2 = []
        yijibiaoti_content = re.findall(pantten_yijibiaoti_content, yijibiaoti)[0]
        catalogue.append(yijibiaoti_content)
        for erjibiaoti in paper_content[yijibiaoti]:
            num = random.randint(2, 6)
            content = paper_content[yijibiaoti][erjibiaoti]
            zhanweifu.append(
                f'任务:生成段落主要内容\n请对以下内容进行提取信息,只需要提取{str(num)}条主要内容,使用条数罗列下面这段话的主要信息,例如1. xxx\n2.xxx \n' + content)
            content_2.append({
                "title_small": re.findall(pantten_erjibiaoti_content, erjibiaoti)[0],
                "content_3": index_zahnweifu
            })
            index_zahnweifu += 1
        content_1.append({
            "title_big": yijibiaoti_content,
            "content_2": content_2
        })

    # Fan all summarisation prompts out to the self-hosted model.
    with concurrent.futures.ThreadPoolExecutor(100) as executor:
        results = executor.map(request_selfmodel_api, zhanweifu)

    # Parse each answer into a list of numbered bullet points.
    zhanweifu = []
    for result in results:
        res = result['choices'][0]['message']['content']
        tiaoshu_list_new = []
        for dantiao in str(res).split("\n"):
            found = re.findall(pantten_content_tiaoshu, dantiao)
            # Skip lines that are not numbered items (the original raised
            # IndexError on e.g. blank lines in the model output).
            if found:
                tiaoshu_list_new.append(found[0].strip())
        zhanweifu.append(tiaoshu_list_new)

    # Replace the placeholder indices with the parsed summaries.
    content_1_new = []
    for yijibiaoti_content in content_1:
        content_2_new = []
        for erjibiaoti_content in yijibiaoti_content["content_2"]:
            content_2_new.append({
                "title_small": erjibiaoti_content["title_small"],
                "content_3": zhanweifu[erjibiaoti_content["content_3"]]
            })
        content_1_new.append({
            "title_big": yijibiaoti_content["title_big"],
            "content_2": content_2_new
        })

    return {
        "title": file_name,
        "catalogue": catalogue,
        "content_1": content_1_new
    }


@app.route('/predict', methods=['POST'])
def upload_file():
    """Accept a .docx upload, process it and return the PPT JSON."""
    if 'file' not in request.files:
        return "1"
    file = request.files.get('file')
    if file and allowed_file(file.filename):
        # basename() blocks path-traversal names like "../../x.docx" while
        # keeping non-ASCII (Chinese) file names intact, which
        # secure_filename() would strip to nothing.
        filename = os.path.basename(file.filename)
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(path)
        result = main(path)
        return_text = {"texts": result, "probabilities": None, "status_code": 200}
        log.log('start at', 'filename:{}, result:{}'.format(path, return_text))
        return jsonify(return_text)
    else:
        return "不允许的文件类型"


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=21000, threaded=True)