排版识别标题级别和正文
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

108 lines
3.1 KiB

import tensorflow as tf
import numpy as np
import os
conll2003_path = "D:/ml/conll2003"  # root directory holding the CoNLL-2003 split files


def load_file(path="/train.txt", base_path=None):
    """Read one CoNLL-2003 split into parallel token/label sequences.

    Each non-blank line holds four whitespace-separated fields:
    "word POS chunk NER-label". Blank lines separate sentences.

    Args:
        path: split file name prefixed with '/' (e.g. "/train.txt").
        base_path: directory containing the split files; defaults to the
            module-level ``conll2003_path``.

    Returns:
        (sentences, labels): two parallel lists, one inner list of words
        and one inner list of NER tags per sentence.
    """
    if base_path is None:
        base_path = conll2003_path
    sentences = []
    label_seqs = []
    sentence = []
    labels = []
    with open(base_path + path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                # POS and chunk columns are present in the file but unused here.
                word, _pos, _chunk, label = line.split()
                sentence.append(word)
                labels.append(label)
            else:
                sentences.append(sentence)
                label_seqs.append(labels)
                sentence = []
                labels = []
    # Flush the last sentence when the file does not end with a blank line
    # (the previous version silently dropped it).
    if sentence:
        sentences.append(sentence)
        label_seqs.append(labels)
    return sentences, label_seqs
def get_dataset():
    """Build and cache the vectorized CoNLL-2003 dataset.

    Loads the three raw splits, builds word/tag index dictionaries over all
    of them, vectorizes each split with ``preproces``, and caches both the
    dictionaries and the arrays under the 'datasetpath' directory.

    Returns:
        (train_X, train_y, valid_X, valid_y, test_X, test_y) as produced
        by ``preproces``.
    """
    # Load the three raw splits.
    train_sentences, train_labels = load_file("/train.txt")
    valid_sentences, valid_labels = load_file("/valid.txt")
    test_sentences, test_labels = load_file("/test.txt")

    # Build the vocabulary and tag inventories over ALL splits. Plain list
    # concatenation replaces np.concatenate: the per-sentence lists are
    # ragged, which recent NumPy refuses to stack implicitly.
    all_sentences = train_sentences + valid_sentences + test_sentences
    all_labels = train_labels + valid_labels + test_labels
    vocab = {word.lower() for sentence in all_sentences for word in sentence}
    tags = {label for labels in all_labels for label in labels}

    # Word index 0 is reserved (e.g. for padding), hence the +1 offset.
    word2idx = {w: i + 1 for i, w in enumerate(vocab)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    os.makedirs('datasetpath', exist_ok=True)  # ensure the cache dir exists
    save_dict(word2idx, os.path.join('datasetpath', 'word2idx.json'))
    # NOTE(review): this stores tag2idx under the name 'idx2Label.json';
    # the misleading filename is kept for compatibility with existing readers.
    save_dict(tag2idx, os.path.join('datasetpath', 'idx2Label.json'))

    num_tags = len(tag2idx)
    train_X, train_y = preproces(word2idx, tag2idx, num_tags, train_sentences, train_labels)
    valid_X, valid_y = preproces(word2idx, tag2idx, num_tags, valid_sentences, valid_labels)
    test_X, test_y = preproces(word2idx, tag2idx, num_tags, test_sentences, test_labels)

    # Cache the vectorized splits so later runs can use load_dataset().
    np.savez(os.path.join('datasetpath', 'dataset.npz'), train_X=train_X, train_y=train_y,
             valid_X=valid_X, valid_y=valid_y, test_X=test_X, test_y=test_y)
    return train_X, train_y, valid_X, valid_y, test_X, test_y
def load_dataset():
    """Reload the cached dataset splits from 'datasetpath/dataset.npz'.

    Returns:
        (train_X, train_y, valid_X, valid_y, test_X, test_y) arrays.
    """
    archive = np.load(os.path.join('datasetpath', 'dataset.npz'))
    split_keys = ('train_X', 'train_y', 'valid_X', 'valid_y', 'test_X', 'test_y')
    return tuple(archive[key] for key in split_keys)
# Maximum sequence length — presumably the padding/truncation length used by
# `preproces` when vectorizing sentences; TODO confirm (consumer not visible here).
max_len = 64
def save_dict(dict, file_path):
    """Serialize a dictionary to ``file_path`` as JSON.

    NOTE(review): the parameter name shadows the builtin ``dict``; it is
    kept unchanged for backward compatibility with keyword callers.
    """
    import json
    # utf-8 + ensure_ascii=False keeps non-ASCII tokens human-readable on disk.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(dict, f, ensure_ascii=False)
def load_dict(path_file):
    """Load and return a JSON dictionary from ``path_file``.

    Counterpart of :func:`save_dict`. The unreachable debug ``print`` that
    followed the ``return`` has been removed.
    """
    import json
    with open(path_file, 'r', encoding='utf-8') as f:
        return json.load(f)
if __name__ == '__main__':
    # Build and cache the dataset, then reload it from disk as a sanity check.
    get_dataset()
    train_X, train_y, valid_X, valid_y, test_X, test_y = load_dataset()
    for split in (train_X, train_y):
        print(len(split))
    for split in (train_X, train_y):
        print(np.array(split).shape)