|
|
|
from docx import Document
|
|
|
|
import platform
|
|
|
|
import os
|
|
|
|
import concurrent.futures
|
|
|
|
# Route all outbound HTTP(S) traffic through a local proxy (must be set before
# requests is used; needed for the OpenAI API calls below).
os.environ['ALL_PROXY'] = 'http://127.0.0.1:10809'
|
|
|
|
import docx
|
|
|
|
import json
|
|
|
|
import re
|
|
|
|
from docx.document import Document
|
|
|
|
from docx.oxml.table import CT_Tbl
|
|
|
|
from docx.oxml.text.paragraph import CT_P
|
|
|
|
from docx.table import _Cell, Table
|
|
|
|
from docx.text.paragraph import Paragraph
|
|
|
|
import requests
|
|
|
|
import random
|
|
|
|
import time
|
|
|
|
from flask import Flask, render_template, request, redirect, url_for, jsonify
|
|
|
|
from werkzeug.utils import secure_filename
|
|
|
|
app = Flask(__name__)

# Directory where uploaded .docx files are stored
UPLOAD_FOLDER = 'uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Regex for lines starting with an Arabic numeral, e.g. "1. xxx"
# NOTE(review): not referenced anywhere in this file — confirm before removing.
RE_CHINA_NUMS = "[1-9].(.*)"
# Allowed upload file extensions
ALLOWED_EXTENSIONS = {'docx'}

# Catalogue span: everything between "目录" (table of contents) and "致谢" (acknowledgements)
pantten_mulu = '目录(.*?)致谢'
# Section body: text between two consecutive headings (filled in via .format())
pantten_xiaobiaoti = "{}(.*?){}"
# Level-1 heading: Chinese numeral + "、", e.g. "一、绪论" (captures numeral and title)
pantten_yijibiaoti = '^([一二三四五六七八九])、(.*)'
# Level-2 heading: "N.N  title", e.g. "1.1  xxx"
pantten_erjibiaoti = '^[0-9](\.[0-9]\d*){1}\s{1,}?.*$'
# Numbered bullet line in a model response, e.g. "1. xxx" (captures the text)
pantten_content_tiaoshu = '[0-9]\.{1}\s{0,}?(.*)'

# Same heading shapes as above, but capturing only the title text
pantten_yijibiaoti_content = '^[一二三四五六七八九]、(.*)'
pantten_erjibiaoti_content = '^[0-9]\.[0-9]\s{1,}?(.*)$'

# Prompt: generate a two-level Chinese catalogue for a thesis title,
# with a min-max range of level-2 headings per level-1 heading
prompt_two_title_min_max = "为论文题目“{}”生成中文目录,要求只有一级标题,二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题生成{}个;每个一级标题包含{}-{}个二级标题"
# Prompt: same, but with a fixed number of level-2 headings per level-1 heading
prompt_two_title_not_min_max = "为论文题目“{}”生成中文目录,要求只有一级标题,二级标题,一级标题使用中文数字 例如一、xxx;二级标题使用阿拉伯数字 例如1.1 xxx;一级标题生成{}个;每个一级标题包含{}个二级标题"

# Regexes for parsing the prompts back out of generated text
pantten_title = "为论文题目“(.*?)”生成中文目录"
pantten_xiaobiaoti_geshu = "每个一级标题包含(.*?)个"
pantten_dabiaoti_geshu = "一级标题生成(.*?)个"

# Prompt: shrink a catalogue down to exactly 4 level-1 headings for a PPT
mulusuojian = "请问把以下目录缩减成只有4个一级标题作为ppt的题目,请问留下原始目录中的哪4个一级标题最合适,一级标题必须在原始目录中\n{}\n"

# Self-hosted model endpoint and OpenAI chat-completions endpoint
self_api = "http://192.168.31.149:12004/predict"
gpt_api = "https://api.openai.com/v1/chat/completions"
|
|
|
|
|
|
|
|
|
|
|
|
class log:
    """Minimal file logger.

    Used as ``log.log(...)`` (class-level call, no instance), appending a
    timestamped line to ``log_file/access-YYYY-MM-DD.log``.
    """

    def __init__(self):
        pass

    @staticmethod
    def log(*args, **kwargs):
        """Append a timestamped log line; extra args/kwargs are passed to print().

        Fixes vs. original: creates the ``log_file`` directory when missing
        (the original raised FileNotFoundError on a fresh deployment), and
        always opens in append mode — ``'a'`` creates the file if absent, so
        the separate ``'w'`` branch guarded by os.path.exists was redundant.
        """
        value = time.localtime(int(time.time()))
        dt = time.strftime('%Y/%m/%d-%H:%M:%S', value)
        dt_log_file = time.strftime('%Y-%m-%d', value)
        log_file = 'log_file/access-%s' % dt_log_file + ".log"
        os.makedirs(os.path.dirname(log_file), exist_ok=True)
        with open(log_file, 'a', encoding='utf-8') as f:
            print(dt, *args, file=f, **kwargs)
|
|
|
|
|
|
|
|
|
|
|
|
def allowed_file(filename):
    """Return True when *filename* carries an extension listed in ALLOWED_EXTENSIONS."""
    _stem, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
|
|
|
|
|
|
|
|
|
def iter_block_items(parent):
    """Yield every Paragraph and Table child of *parent* in document order.

    *parent* is normally a main Document object, but a _Cell also works since
    cells can themselves hold paragraphs and tables. Any other type raises
    ValueError.
    """
    if isinstance(parent, Document):
        container = parent.element.body
    elif isinstance(parent, _Cell):
        container = parent._tc
    else:
        raise ValueError("something's not right")

    for node in container.iterchildren():
        if isinstance(node, CT_P):
            yield Paragraph(node, parent)
        elif isinstance(node, CT_Tbl):
            yield Table(node, parent)
|
|
|
|
|
|
|
|
|
|
|
|
def read_table(table):
    """Convert a docx Table into a list of rows, each row a list of cell texts."""
    rows = []
    for row in table.rows:
        texts = [cell.text for cell in row.cells]
        rows.append(texts)
    return rows
|
|
|
|
|
|
|
|
|
|
|
|
def read_word(word_path):
    """Read a .docx file into one string: paragraph texts in order, with each
    table serialized as pseudo-HTML wrapped in <tbStart>/<tbEnd> markers."""
    pieces = []
    document = docx.Document(word_path)
    for block in iter_block_items(document):
        if isinstance(block, Paragraph):
            pieces.append(block.text)
        elif isinstance(block, Table):
            # One "<td>...</td>" run per row, then rows joined as <tr> elements.
            row_cells = [
                "<td>" + "</td>\n<td>".join(cells) + "</td>"
                for cells in read_table(block)
            ]
            body = "\n<tr>\n" + "\n</tr>\n<tr>\n".join(row_cells) + "\n</tr>\n"
            pieces.append("<tbStart>\n<table>" + body + "</table>\n\n<tbEnd>")
    return "\n".join(pieces)
|
|
|
|
|
|
|
|
|
|
|
|
def getText(fileName):
    """Return the plain text of all paragraphs in the .docx *fileName*, newline-joined."""
    document = docx.Document(fileName)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
|
|
|
|
|
|
|
|
|
|
|
|
def request_selfmodel_api(prompt):
    """POST *prompt* to the self-hosted model endpoint and return the parsed JSON.

    The response is OpenAI chat-completions shaped — callers read
    ``res['choices'][0]['message']['content']``.

    :param prompt: user prompt string, sent as a single chat message
    :return: decoded JSON body as a dict
    :raises requests.HTTPError: when the server answers with an error status
    """
    # Reuse the module-level endpoint constant instead of re-hardcoding the URL.
    url = self_api
    data = {
        "model": "gpt-4-turbo-preview",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "top_p": 0.7,
        "temperature": 0.95
    }
    # NOTE(review): timeout=100000 seconds (~28 h) looks accidental; kept for
    # compatibility, but a value in the minutes range is almost certainly intended.
    response = requests.post(
        url,
        json=data,
        timeout=100000
    )
    # Fail loudly on HTTP errors instead of returning an error payload that
    # callers would mis-parse as a completion (KeyError on 'choices').
    response.raise_for_status()
    return response.json()
|
|
|
|
|
|
|
|
|
|
|
|
def request_chatgpt_api(prompt):
    """Send *prompt* to the OpenAI chat-completions API and return the parsed JSON.

    :param prompt: user prompt string, sent as a single chat message
    :return: decoded JSON body as a dict — callers read
             ``['choices'][0]['message']['content']``
    """
    # SECURITY: an API key was committed here in plain text. It should be
    # rotated and supplied via the OPENAI_API_KEY environment variable; the
    # literal remains only as a backward-compatible fallback.
    OPENAI_API_KEY = os.environ.get(
        "OPENAI_API_KEY",
        "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")
    url = "https://api.openai.com/v1/chat/completions"
    # url = "https://one.aiskt.com"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    data = {
        "model": "gpt-4-turbo-preview",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "top_p": 0.7,
        "temperature": 0.95
    }
    # json= lets requests serialize the payload itself; equivalent to the
    # original data=json.dumps(data) but idiomatic.
    response = requests.post(url,
                             headers=headers,
                             json=data,
                             timeout=1200)
    return response.json()
|
|
|
|
|
|
|
|
|
|
|
|
def yanzhengyijibiaoti(mulu, res):
    """Validate the GPT-shrunk catalogue against the original one.

    Collects every level-1 heading ("一、xxx" style) from *mulu* whose title
    text appears in the model response *res*, and accepts the response only
    when exactly four such headings matched.

    :param mulu: original catalogue, one heading per line
    :param res: model response text to check headings against
    :return: (True/False — exactly four headings matched, list of the matched
             headings reassembled as "numeral、title" strings)
    """
    mulu_list = str(mulu).split("\n")

    dabiaoti_list = []
    dabiaoti_res_list = []
    for line in mulu_list:
        # Fix: reuse the single findall result instead of running the regex
        # twice per line as the original did.
        matches = re.findall(pantten_yijibiaoti, line)
        if matches:
            # matches[0] is a (chinese_numeral, title) tuple from the two groups.
            dabiaoti_list.append(matches[0])

    for numeral_title in dabiaoti_list:
        # Keep the heading when its title text occurs anywhere in the response.
        if numeral_title[1].strip() in res:
            dabiaoti_res_list.append("、".join(numeral_title))

    return_bool = len(dabiaoti_res_list) == 4
    return return_bool, dabiaoti_res_list
|
|
|
|
|
|
|
|
|
|
|
|
def get_document_structure(file_path):
    """Return [(level, text), ...] for every Heading-styled paragraph in a .docx.

    :param file_path: path of the document to scan
    :return: list of (heading level int, paragraph text) in document order

    Fix: the original parsed the level with ``int(style.name[7:])``, which
    raises ValueError for heading styles without a trailing number (e.g. a
    plain "Heading" style). We extract the first digit run instead and skip
    headings that have none.
    """
    doc = docx.Document(file_path)
    structure = []
    for paragraph in doc.paragraphs:
        style = paragraph.style
        if style.name.startswith("Heading"):
            match = re.search(r"\d+", style.name)
            if match is None:
                # Heading-like style with no level number — nothing to record.
                continue
            level = int(match.group())
            text = paragraph.text
            structure.append((level, text))
    return structure
|
|
|
|
|
|
|
|
def print_document_structure(structure):
    """Pretty-print a (level, text) outline, one space of indent per level above 1."""
    for level, text in structure:
        pad = ' ' * (level - 1)
        print(f"{pad}- {text}")
|
|
|
|
|
|
|
|
def catalogue_extract(structure):
    """Filter a (level, text) structure down to catalogue entries.

    Keeps only texts matching the level-1 ("一、xxx") or level-2 ("1.1 xxx")
    heading patterns; the final structure entry is never considered.

    :return: (list of cleaned heading strings, the same list newline-joined)
    """
    entries = []
    for _level, raw in structure[:-1]:
        cleaned = str(raw).strip(" ").strip("\n")
        looks_erji = bool(re.findall(pantten_erjibiaoti, cleaned))
        looks_yiji = bool(re.findall(pantten_yijibiaoti, cleaned))
        if looks_erji or looks_yiji:
            entries.append(cleaned)
    return entries, "\n".join(entries)
|
|
|
|
|
|
|
|
|
|
|
|
def main(path):
    """Build the PPT-outline payload for an uploaded .docx thesis.

    Pipeline:
      1. read the whole document (paragraphs + tables) as text,
      2. slice the body into sections keyed by catalogue headings,
      3. ask GPT to shrink the catalogue to exactly 4 level-1 headings,
         retrying until the answer validates,
      4. summarize every level-2 section into bullet points via the
         self-hosted model, fanned out over a thread pool,
      5. assemble and return {"title", "catalogue", "content_1"}.

    :param path: filesystem path of the uploaded .docx file
    :return: dict with keys "title" (str), "catalogue" (list of level-1
             titles) and "content_1" (nested title/bullet structure)
    """

    # Pick a path separator per OS to derive the PPT title from the file name.
    # (os.path.basename/splitext would cover both branches.)
    system = platform.system()
    if system == 'Linux':
        file_name = path.split("/")[-1].split(".")[0]
    else:
        file_name = path.split("\\")[-1].split(".")[0]

    # JSON-encode the document text: real newlines become the two characters
    # "\n" inside text_1, which is why later code strips/replaces "\\n".
    text_1 = json.dumps(read_word(path), ensure_ascii=False)
    print(text_1)

    # Legacy catalogue extraction from the "目录 ... 致谢" span, superseded by
    # get_document_structure()/catalogue_extract() below.
    # mulu_str = re.findall(pantten_mulu, text_1)[0]
    # print(mulu_str)
    # mulu_list_xuhao = str(mulu_str).split("\\n")
    #
    # mulu_list = []
    # for i in mulu_list_xuhao:
    #     if i != "":
    #         mulu_list.append(i.split("\\t")[0])
    #
    # mulu_list.append("致谢")
    #
    # print(mulu_list)

    document_structure = get_document_structure(path)
    mulu_list, catalogue_str = catalogue_extract(document_structure)

    # Body text after the last "目录" (catalogue) marker.
    text = text_1.split("目录")[-1].strip("\\n")

    # Map: level-1 heading -> {level-2 heading -> section text}.
    yijibiaoti = ""
    paper_content = {}
    for i in range(len(mulu_list) - 1):
        title = mulu_list[i].strip(" ").strip("\\n")
        print(mulu_list[i])
        print(mulu_list[i + 1])
        print(re.findall(pantten_xiaobiaoti.format(mulu_list[i], mulu_list[i + 1]), text))
        # Text between this heading and the next; an IndexError here means the
        # heading pair was not found verbatim in the body text.
        content = str(re.findall(pantten_xiaobiaoti.format(mulu_list[i], mulu_list[i + 1]), text)[0]).strip(
            " ").strip("\\n")

        yijibiaoti_res = re.findall(pantten_yijibiaoti, title)
        erjibiaoti_res = re.findall(pantten_erjibiaoti, title)
        if yijibiaoti_res != []:
            # Level-1 heading: open a new section bucket and remember it.
            paper_content[title] = {}
            yijibiaoti = title
            continue

        elif erjibiaoti_res != []:
            # Level-2 heading: store its body text with real newlines restored.
            paper_content[yijibiaoti][title] = content.replace("\\n", "\n")

        else:
            # NOTE(review): two suspected bugs here — "\n".join(title + content)
            # interleaves a newline between every single character (probably
            # meant "\n" + title + content), and *title* was never added as a
            # key in this branch, so += raises KeyError. Likely dead code since
            # catalogue_extract only emits level-1/level-2 headings — confirm.
            paper_content[yijibiaoti][title] += "\n".join(title + content)

    # Ask GPT to pick exactly 4 level-1 headings; retry until validation passes.
    while True:
        mulu_str = "\n".join(mulu_list[:-1])
        prompt = f'请问把以下目录缩减成只有4个一级标题作为ppt的题目,请问留下原始目录中的哪4个一级标题最合适,一级标题必须在原始目录中\n{mulu_str}\n'

        try:
            res = request_chatgpt_api(prompt)['choices'][0]['message']['content']
        except:
            # NOTE(review): bare except + continue retries forever on any
            # failure (network, quota, malformed response) — consider a retry
            # cap and narrower exception types.
            continue

        # Sample GPT response kept for offline debugging:
        # res = '''根据您提供的目录内容,如果要将其缩减为只包含4个一级标题的PPT题目,建议选择以下四个一级标题,因为它们分别代表了研究的引入、理论框架、实际应用与实践,以及未来展望,从而形成了一个完整的研究过程和内容框架:
        #
        # 1. 一、绪论
        # 2. 二、电影网站设计的基本概念
        # 3. 三、Python在电影网站设计中的应用
        # 4. 五、电影网站设计的实践与展望
        #
        # 这样的选择既涵盖了研究的背景、目的与意义(绪论),也包括了研究的理论基础(电影网站设计的基本概念),以及研究的实际操作和技术实现(Python在电影网站设计中的应用),最后还有对项目实践经验的总结和对未来发展的展望(电影网站设计的实践与展望)。这四个部分共同构成了一个完整的研究报告或项目介绍的框架,能够全面展示电影网站设计项目的各个方面。
        # '''

        shaixuan_bool, dabiaoti_res_list = yanzhengyijibiaoti("\n".join(mulu_list), res.replace("\n", "\\n"))
        if shaixuan_bool == True:
            break

    # Build one summarization prompt per level-2 section. content_3 temporarily
    # holds an index into *zhanweifu* (placeholder list) until the thread-pool
    # results are filled in below.
    index_zahnweifu = 0
    zhanweifu = []
    content_1 = []
    catalogue = []
    for yijibiaoti in dabiaoti_res_list:
        content_2 = []
        yijibiaoti_content = re.findall(pantten_yijibiaoti_content, yijibiaoti)[0]
        catalogue.append(yijibiaoti_content)
        for erjibiaoti in paper_content[yijibiaoti]:
            # Ask for a random 2-6 bullet summary of each level-2 section.
            num = random.randint(2, 6)
            content = paper_content[yijibiaoti][erjibiaoti]
            # Synchronous single-request variant, replaced by the thread pool:
            # res = request_selfmodel_api(
            #     f'任务:生成段落主要内容\n请对以下内容进行提取信息,只需要提取{str(num)}条主要内容,使用条数罗列下面这段话的主要信息,例如1. xxx\n2.xxx \n' + content)[
            #     'choices'][0]['message']['content']

            zhanweifu.append(f'任务:生成段落主要内容\n请对以下内容进行提取信息,只需要提取{str(num)}条主要内容,使用条数罗列下面这段话的主要信息,例如1. xxx\n2.xxx \n' + content)

            content_2.append({
                "title_small": re.findall(pantten_erjibiaoti_content, erjibiaoti)[0],
                "content_3": index_zahnweifu
            })
            index_zahnweifu += 1
        content_1.append({
            "title_big": yijibiaoti_content,
            "content_2": content_2
        })

    # Fan the prompts out to the self-hosted model; map() preserves order.
    with concurrent.futures.ThreadPoolExecutor(100) as executor:
        results = executor.map(request_selfmodel_api, zhanweifu)

    # Reuse *zhanweifu* to hold the parsed bullet lists, in prompt order.
    zhanweifu = []
    for result in results:
        res = result['choices'][0]['message']['content']

        tiaoshu_list = str(res).split("\n")

        tiaoshu_list_new = []
        for dantiao in tiaoshu_list:
            # IndexError here means a response line was not "N. ..." shaped.
            tiaoshu_list_new.append(re.findall(pantten_content_tiaoshu, dantiao)[0].strip())
        zhanweifu.append(tiaoshu_list_new)

    # Resolve the placeholder indices into the actual bullet lists.
    content_1_new = []

    for yijibiaoti_content in content_1:
        content_2_new = []
        title_big = yijibiaoti_content["title_big"]
        for erjibiaoti_content in yijibiaoti_content["content_2"]:
            title_small = erjibiaoti_content["title_small"]
            content_3 = zhanweifu[erjibiaoti_content["content_3"]]
            content_2_new.append({
                "title_small": title_small,
                "content_3": content_3
            })
        content_1_new.append({
            "title_big": title_big,
            "content_2": content_2_new
        })

    # Final payload returned to the Flask handler.
    data_new = {
        "title": file_name,
        "catalogue": catalogue,
        "content_1": content_1_new
    }

    # with open("data/ceshi.json", "w", encoding="utf-8") as f:
    #     f.write(json.dumps(data_new, ensure_ascii=False, indent=2))

    return data_new
|
|
|
|
|
|
|
|
@app.route('/predict', methods=['POST'])
def upload_file():
    """Accept a .docx upload, run the outline pipeline, return it as JSON.

    Returns the legacy sentinel "1" when the request has no 'file' part, and a
    Chinese error string for disallowed file types (kept for compatibility).
    """
    if 'file' not in request.files:
        return "1"

    file = request.files.get('file')

    if file and allowed_file(file.filename):
        # Fix: the client-supplied filename was joined into the path verbatim,
        # allowing path traversal (e.g. "../../x.docx"). Strip any directory
        # components. We deliberately do NOT use werkzeug's secure_filename
        # here because it drops non-ASCII characters and would mangle the
        # Chinese filenames this service receives (the name becomes the title).
        filename = os.path.basename(file.filename.replace("\\", "/"))
        # Ensure the upload directory exists before saving.
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        print(path)
        file.save(path)
        # file.save(file.filename)

        result = main(path)
        return_text = {"texts": result, "probabilities": None, "status_code": 200}

        log.log('start at',
                'filename:{}, result:{}'.format(
                    path, return_text))
        return jsonify(return_text)
    else:
        return "不允许的文件类型"
|
|
|
|
|
|
|
|
# Development entry point: serve on all interfaces, port 21000, with the
# threaded werkzeug server (use a production WSGI server for deployment).
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=21000, threaded=True)
|