import tensorflow as tf
import numpy as np
import os
import json

conll2003_path = "D:/ml/conll2003"
max_len = 64


def load_file(path="/train.txt"):
    # Load one split of the CoNLL-2003 data. Each non-blank line holds
    # "word POS chunk NER-label"; blank lines separate sentences.
    sentences = []
    sentence_labels = []
    with open(conll2003_path + path) as f:
        sentence = []
        labels = []
        for line in f:
            line = line.strip()
            if line:
                word, pos, chunk, label = line.split()
                sentence.append(word)
                labels.append(label)
            else:
                sentences.append(sentence)
                sentence_labels.append(labels)
                sentence = []
                labels = []
        # Keep the last sentence if the file does not end with a blank line
        if sentence:
            sentences.append(sentence)
            sentence_labels.append(labels)
    return sentences, sentence_labels
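

# `preproces` is called by get_dataset() below but is not defined in this file.
# A minimal sketch of what it plausibly does, under these assumptions: token
# sequences are padded/truncated to max_len, index 0 (reserved by word2idx) is
# the padding value, and tag indices are one-hot encoded to num_tags classes.
# Padding labels with 0 conflates padding with the tag at index 0; the original
# preprocessing may have reserved a dedicated padding tag instead.
def preproces(word2idx, tag2idx, num_tags, sentences, labels):
    X = [[word2idx.get(w.lower(), 0) for w in s] for s in sentences]
    y = [[tag2idx[t] for t in seq] for seq in labels]
    X = tf.keras.preprocessing.sequence.pad_sequences(
        X, maxlen=max_len, padding='post', truncating='post', value=0)
    y = tf.keras.preprocessing.sequence.pad_sequences(
        y, maxlen=max_len, padding='post', truncating='post', value=0)
    y = tf.keras.utils.to_categorical(y, num_classes=num_tags)
    return X, y

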
def get_dataset():
    # Load the three CoNLL-2003 splits
    train_sentences, train_labels = load_file("/train.txt")
    valid_sentences, valid_labels = load_file("/valid.txt")
    test_sentences, test_labels = load_file("/test.txt")

    # Build the vocabulary and tag set over all splits
    all_sentences = train_sentences + valid_sentences + test_sentences
    all_labels = train_labels + valid_labels + test_labels
    vocab = set()
    tags = set()
    for sentence in all_sentences:
        for word in sentence:
            vocab.add(word.lower())
    for labels in all_labels:
        for label in labels:
            tags.add(label)

    # Reserve index 0 for padding, so word indices start at 1
    word2idx = {w: i + 1 for i, w in enumerate(vocab)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    os.makedirs('datasetpath', exist_ok=True)
    save_dict(word2idx, os.path.join('datasetpath', 'word2idx.json'))
    # Note: this saves the tag-to-index mapping under the idx2Label.json name
    save_dict(tag2idx, os.path.join('datasetpath', 'idx2Label.json'))

    num_words = len(word2idx) + 1
    num_tags = len(tag2idx)

    train_X, train_y = preproces(word2idx, tag2idx, num_tags, train_sentences, train_labels)
    valid_X, valid_y = preproces(word2idx, tag2idx, num_tags, valid_sentences, valid_labels)
    test_X, test_y = preproces(word2idx, tag2idx, num_tags, test_sentences, test_labels)

    np.savez(os.path.join('datasetpath', 'dataset.npz'), train_X=train_X, train_y=train_y, valid_X=valid_X,
             valid_y=valid_y, test_X=test_X, test_y=test_y)
    return train_X, train_y, valid_X, valid_y, test_X, test_y


def load_dataset():
    # Restore the preprocessed arrays saved by get_dataset()
    dataset = np.load(os.path.join('datasetpath', 'dataset.npz'))
    train_X = dataset['train_X']
    train_y = dataset['train_y']
    valid_X = dataset['valid_X']
    valid_y = dataset['valid_y']
    test_X = dataset['test_X']
    test_y = dataset['test_y']
    return train_X, train_y, valid_X, valid_y, test_X, test_y


def save_dict(d, file_path):
    # Save a dictionary to a JSON file
    with open(file_path, 'w') as f:
        json.dump(d, f)


def load_dict(path_file):
    # Load a dictionary from a JSON file
    with open(path_file, 'r') as f:
        loaded_dict = json.load(f)
    return loaded_dict


if __name__ == '__main__':
    get_dataset()
    train_X, train_y, valid_X, valid_y, test_X, test_y = load_dataset()

    print(len(train_X))
    print(len(train_y))

    print(np.array(train_X).shape)
    print(np.array(train_y).shape)