# coding:utf-8
import os

import pandas as pd

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # hide all GPUs: force CPU inference

import torch
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding,
    Trainer, TrainingArguments
)
from flask import Flask, jsonify
from flask import request
import uuid

app = Flask(__name__)
app.config["JSON_AS_ASCII"] = False

from threading import Thread
import redis
import time
import json
import docx2txt

pool = redis.ConnectionPool(host='localhost', port=63179, max_connections=100, db=13, password="zhicheng123*")
# Note: when an explicit connection_pool is passed, redis-py ignores
# decode_responses on this constructor, so reads below return bytes.
redis_ = redis.Redis(connection_pool=pool, decode_responses=True)

db_key_query = 'query'
db_key_querying = 'querying'
db_key_queryset = 'queryset'
batch_size = 32
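
# How these keys are used (inferred from this file): db_key_query is the list
# a producer pushes pending jobs onto; db_key_querying is a set of job ids in
# flight, cleared once a result is written; db_key_queryset is defined but
# never referenced here.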

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("chatgpt-detector-roberta-chinese")
# model = AutoModelForSequenceClassification.from_pretrained("chatgpt-detector-roberta-chinese").cuda()
model = AutoModelForSequenceClassification.from_pretrained("chatgpt-detector-roberta-chinese").cpu()
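# "chatgpt-detector-roberta-chinese" resolves to a local directory here; from
# the name it is presumably a copy of the Hello-SimpleAI
# chatgpt-detector-roberta-chinese checkpoint, though that is an assumption.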


def model_predict(text):
    # Tokenize a single passage, truncated to the model's 512-token limit.
    tokenized_text = tokenizer.encode_plus(text, max_length=512, add_special_tokens=True,
                                           truncation=True, return_offsets_mapping=True)

    input_ids = torch.tensor([tokenized_text["input_ids"]])
    token_type = torch.tensor([tokenized_text["token_type_ids"]])

    print(input_ids)

    input_ids, token_type_ids = input_ids.long(), token_type.long()
    # Move tensors to the GPU (disabled: this deployment runs on CPU)
    # batch_masks = input_ids.gt(0).cuda()
    # input_ids, token_type_ids = input_ids.cuda(), token_type_ids.cuda()
    batch_masks = input_ids.gt(0)  # attention mask: 1 for every non-padding token

    # Inference only, so skip gradient tracking; no labels are needed here.
    with torch.no_grad():
        output = model(input_ids=input_ids, token_type_ids=token_type_ids,
                       attention_mask=batch_masks)

    output = torch.sigmoid(output[0]).tolist()
    print(output)

    return_list = {
        "human": output[0][0],
        "robot": output[0][1]
    }
    return return_list
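
# Illustrative only: model_predict scores one passage at a time. The two
# sigmoid-activated logits are treated as independent "human" / "robot"
# probabilities, e.g. (hypothetical input):
#   scores = model_predict("这是一段待检测的中文文本。")
#   if scores["robot"] > 0.9:
#       ...  # flagged as machine-generated in main() below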


def main(content_list: list):
    '''
    Example return value:
{
"gpt_content": " \n <reference>\n XXXX大学\n 毕业(设计)论文\n \n (校徽)\n \n 论文题目: 论小区园林景观方案设计思路问题的探讨与分析 \n 专业班级: \n 学 号: \n 学生姓名: \n 指导教师: \n 电 话: \n 学院名称: \n \n \n \n \n \n 完成日期: 年 月 日\n </reference>\n X X 大 学\n \n <reference>\n 毕业论文(设计)原创性声明\n 本人郑重声明:所呈交的论文(设计)是本人在导师的指导下独立进行研究所取得的研究成果。除了文中特别加以标注引用的内容外,本论文不包含任何其他个人或集体已经发表或撰写的成果作品。对本文的研究做出重要贡献的个人和集体,均已在文中以明确方式标明。本人完全意识到本声明的法律后果由本人承担。\n 学生签名: 日期:20 年 月 日\n </reference>\n \n <reference>\n 毕业论文(设计)版权使用授权书\n 本毕业论文(设计)作者完全了解学校有关保留、使用论文(设计)的规定,同意学校保留并向国家有关部门或机构送交论文(设计)的复印件和电子版,允许论文(设计)被查阅和借阅。本人授权XX大学可以将本论文(设计)的全部或部分内容编入有关数据库进行检索,可以采用影印、缩印或扫描等复制手段保存和汇编本论文(设计)。\n \n \n \n 学生签名: 日期:20 年 月 日\n 导师签名: 日期:20 年 月 日\n \n </reference>\n 摘要\n 本论文探讨与分析了小区园林景观方案设计思路问题。在引言部分,论文介绍了研究背景、研究目的和研究意义。接着,论文分析了小区园林景观设计的基本原则,包括空间布局原则、植物配置原则和材料选择原则。然后,论文分析了小区园林景观设计的要素,包括地形地貌分析、植被分析和建筑分析。接下来,论文提出了小区园林景观设计的设计思路,包括自然景观与人工景观的融合、功能与美学的平衡以及环境可持续性设计的考虑。在实施问题分析部分,论文分析了设计方案的可行性、施工过程中的问题与解决方案以及维护与管理问题。最后,论文总结了设计思路问题,并讨论了研究的局限性与展望。整篇论文共200字左右。\n 关键词:小区园林景观设计,基本原则,要素,设计思路,实施问题分析\n Abstract\n This paper discusses and analyzes the issues related to the design ideas of landscape schemes in residential areas. In the introduction, the paper introduces the research background, research objectives, and research significance. Then, the paper analyzes the basic principles of landscape design in residential areas, including principles of spatial layout, plant configuration, and material selection. Next, the paper analyzes the elements of landscape design in residential areas, including analysis of terrain and landforms, vegetation analysis, and architectural analysis. Subsequently, the paper proposes the design ideas for landscape design in residential areas, including the integration of natural and artificial landscapes, the balance between functionality and aesthetics, and consideration of environmental sustainability in design. In the section on implementation problem analysis, the paper analyzes the feasibility of design schemes, the issues and solutions in the construction process, and maintenance and management issues. Finally, the paper summarizes the design ideas and discusses the limitations and prospects of the research. The entire paper is approximately 200 words.\n Keyword:Landscape Design of Residential Community:; Basic Principles; Elements; Design Approach; Analysis of Implementation Issues\n \n <reference>\n 目录\n 一、引言 6\n 1.1 研究背景 6\n
"gpt_score_list": "[1, 1, 1, 1, 0.9965, 1, 1, -0.9996, -0.9997, -0.9999, -0.9999, -0.9986, -0.9991, -0.9912, -0.9998, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9992, -0.9999, -0.9999, -0.9998, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9999, -0.9949, -0.9999, -0.9999, -0.9991, -0.9932, -0.9986, -0.9984, -0.9999, 1]",
"gpt_total_score": "88",
"total_paragraph": "55",
"total_words": "12843"
}
'''
    return_list = {}
    gpt_content = []
    gpt_score_list = []
    sim_word = 0       # characters in paragraphs scored robot > 0.9
    sim_word_5_9 = 0   # characters in paragraphs scored 0.5 < robot <= 0.9
    total_words = 0
    total_paragraph = len(content_list)

    for i in range(len(content_list)):
        total_words += len(content_list[i])
        res = model_predict(content_list[i])  # {"human": ..., "robot": ...}
        if res["robot"] > 0.9:
            gpt_score_list.append(res["robot"])
            sim_word += len(content_list[i])
            gpt_content.append(
                "<em class=\"similar\" id='score_{}'>".format(str(i)) + content_list[i] + "。\n" + "</em>")
        elif res["robot"] > 0.5:  # 0.5 < robot <= 0.9
            gpt_score_list.append(res["robot"])
            sim_word_5_9 += len(content_list[i])
            gpt_content.append(
                "<em class=\"color-gold\" id='score_{}'>".format(str(i)) + content_list[i] + "。\n" + "</em>")
        else:
            gpt_score_list.append(0)
            gpt_content.append(content_list[i] + "。\n")

    return_list["gpt_content"] = "".join(gpt_content)
    return_list["gpt_score_list"] = str(gpt_score_list)
    return_list["total_paragraph"] = str(total_paragraph)
    return_list["total_words"] = str(total_words)
    # Overall score: full weight for characters flagged > 0.9, half weight for
    # the 0.5–0.9 band. Multiply by 100 before rounding so the result is a
    # clean percentage (rounding first leaves float noise, e.g. 88.00000000000001).
    return_list["gpt_total_score"] = str(round((sim_word / total_words + 0.5 * (sim_word_5_9 / total_words)) * 100, 2))

    return return_list
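
# Worked example of the score above (hypothetical numbers): for three
# 100-character paragraphs where one scores robot=0.95 and another robot=0.7,
# sim_word=100, sim_word_5_9=100, total_words=300, so
# gpt_total_score = round((100/300 + 0.5 * 100/300) * 100, 2) = 50.0.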


def classify():  # worker loop: pop jobs from Redis and run the model
    while True:
        try:
            if redis_.llen(db_key_query) == 0:  # nothing queued: poll again
                time.sleep(3)
                continue
            query = redis_.lpop(db_key_query).decode('UTF-8')  # pool returns bytes
            print("query", query)
            data_dict_path = json.loads(query)
            id_ = data_dict_path['id']
            path = data_dict_path['path']

            with open(path, encoding='utf8') as f1:
                # load the request payload written by the producer
                data_dict = json.load(f1)

            queue_uuid = data_dict['id']
            dataBases = data_dict['dataBases']
            minSimilarity = data_dict['minSimilarity']
            minWords = data_dict['minWords']
            title = data_dict['title']
            author = data_dict['author']
            content_list = data_dict['content_list']
            token = data_dict['token']
            account = data_dict['account']
            goodsId = data_dict['goodsId']
            callbackUrl = data_dict['callbackUrl']

            gpt_data_bean = main(content_list)

            fmt = '%Y-%m-%d %H:%M:%S'
            value = time.localtime(int(time.time()))
            dt = time.strftime(fmt, value)

            fmt = '%Y-%m-%d'
            value = time.localtime(int(time.time()))
            time_range_dt = time.strftime(fmt, value)

            result = {
                "author": author,
                "check_time": dt,
                "check_type": "1",
                "gpt_data_bean": gpt_data_bean,
                "order_no": queue_uuid,
                "time_range": "1990-01-01至{}".format(time_range_dt),
                "title": title
            }

            return_text = {"result": result, "probabilities": None, "status_code": 200}
            load_result_path = "./new_data_logs/{}.json".format(queue_uuid)

            print("query_id: ", queue_uuid)
            print("load_result_path: ", load_result_path)

            with open(load_result_path, 'w', encoding='utf8') as f2:
                # ensure_ascii=False writes Chinese as-is instead of \u escapes;
                # indent=4 pretty-prints the JSON
                json.dump(return_text, f2, ensure_ascii=False, indent=4)
            redis_.set(queue_uuid, load_result_path, 86400)  # result path, 1-day TTL
            redis_.srem(db_key_querying, queue_uuid)         # job no longer in flight
        except Exception as e:
            print("classify worker error:", e)
            continue
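
# Illustrative producer sketch (an assumption, not part of this service): a
# client is expected to write the request JSON to a file, push {"id", "path"}
# onto the "query" list, and later read the result file whose path is stored
# under the job id. The ./request_data_logs/ directory name is hypothetical.
#
#   job_id = str(uuid.uuid1())
#   req_path = "./request_data_logs/{}.json".format(job_id)
#   with open(req_path, "w", encoding="utf8") as f:
#       json.dump({"id": job_id, "title": "...", "author": "...",
#                  "content_list": ["第一段", "第二段"], "dataBases": "",
#                  "minSimilarity": "", "minWords": "", "token": "",
#                  "account": "", "goodsId": "", "callbackUrl": ""},
#                 f, ensure_ascii=False)
#   redis_.rpush(db_key_query, json.dumps({"id": job_id, "path": req_path}))
#   redis_.sadd(db_key_querying, job_id)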


if __name__ == '__main__':
    # Start the background worker. The Flask app defined above is never served
    # in this file; only the queue consumer runs.
    t = Thread(target=classify)
    t.start()