import tensorflow as tf
import numpy as np
import os
import json

conll2003_path = "D:/ml/conll2003"
dataset_path = 'datasetpath'   # output directory for the vocab files and arrays
max_len = 64                   # sentences are padded/truncated to this length


def load_file(path="/train.txt"):
    """Load one CoNLL-2003 split into parallel lists of token and tag sequences."""
    sentences = []
    sentence_labels = []
    with open(conll2003_path + path) as f:
        words = []
        tags = []
        for line in f:
            line = line.strip()
            if line.startswith("-DOCSTART-"):
                # Skip document boundary markers
                continue
            if line:
                word, pos, chunk, label = line.split()
                words.append(word)
                tags.append(label)
            else:
                if words:  # blank line ends a sentence
                    sentences.append(words)
                    sentence_labels.append(tags)
                words = []
                tags = []
        if words:  # flush the last sentence if the file lacks a trailing blank line
            sentences.append(words)
            sentence_labels.append(tags)
    return sentences, sentence_labels


def preprocess(word2idx, tag2idx, num_tags, sentences, labels):
    # NOTE: this helper was missing from the original script; the following is
    # an assumed implementation. Words are mapped to indices (0 = padding/
    # unknown), sequences are truncated/padded to max_len, and tags are
    # one-hot encoded so y has shape (num_sentences, max_len, num_tags).
    X = []
    y = []
    for words, word_tags in zip(sentences, labels):
        word_ids = [word2idx.get(w.lower(), 0) for w in words][:max_len]
        tag_ids = [tag2idx[t] for t in word_tags][:max_len]
        padding = max_len - len(word_ids)
        word_ids += [0] * padding
        tag_ids += [0] * padding  # assumption: pad positions reuse tag index 0
        X.append(word_ids)
        y.append(tf.keras.utils.to_categorical(tag_ids, num_classes=num_tags))
    return np.array(X), np.array(y)


def get_dataset():
    # Load the three splits
    train_sentences, train_labels = load_file("/train.txt")
    valid_sentences, valid_labels = load_file("/valid.txt")
    test_sentences, test_labels = load_file("/test.txt")

    # Build the vocabulary and tag set over all splits
    all_sentences = train_sentences + valid_sentences + test_sentences
    all_labels = train_labels + valid_labels + test_labels
    vocab = set()
    tags = set()
    for sentence in all_sentences:
        for word in sentence:
            vocab.add(word.lower())
    for labels in all_labels:
        for label in labels:
            tags.add(label)

    # Sort for reproducible indices across runs; index 0 is reserved for padding
    word2idx = {w: i + 1 for i, w in enumerate(sorted(vocab))}
    tag2idx = {t: i for i, t in enumerate(sorted(tags))}

    os.makedirs(dataset_path, exist_ok=True)
    save_dict(word2idx, os.path.join(dataset_path, 'word2idx.json'))
    save_dict(tag2idx, os.path.join(dataset_path, 'tag2idx.json'))

    num_words = len(word2idx) + 1  # +1 for the padding/unknown index
    num_tags = len(tag2idx)

    train_X, train_y = preprocess(word2idx, tag2idx, num_tags, train_sentences, train_labels)
    valid_X, valid_y = preprocess(word2idx, tag2idx, num_tags, valid_sentences, valid_labels)
    test_X, test_y = preprocess(word2idx, tag2idx, num_tags, test_sentences, test_labels)

    np.savez(os.path.join(dataset_path, 'dataset.npz'),
             train_X=train_X, train_y=train_y,
             valid_X=valid_X, valid_y=valid_y,
             test_X=test_X, test_y=test_y)
    return train_X, train_y, valid_X, valid_y, test_X, test_y


def load_dataset():
    dataset = np.load(os.path.join(dataset_path, 'dataset.npz'))
    train_X = dataset['train_X']
    train_y = dataset['train_y']
    valid_X = dataset['valid_X']
    valid_y = dataset['valid_y']
    test_X = dataset['test_X']
    test_y = dataset['test_y']
    return train_X, train_y, valid_X, valid_y, test_X, test_y


def save_dict(d, file_path):
    # Save a dictionary to a JSON file
    with open(file_path, 'w') as f:
        json.dump(d, f)


def load_dict(file_path):
    # Load a dictionary from a JSON file
    with open(file_path, 'r') as f:
        return json.load(f)


if __name__ == '__main__':
    get_dataset()
    train_X, train_y, valid_X, valid_y, test_X, test_y = load_dataset()
    print(len(train_X))
    print(len(train_y))
    print(train_X.shape)
    print(train_y.shape)
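
# --- Example usage (sketch) --------------------------------------------------
# A minimal illustration of restoring the saved vocabulary for inference,
# assuming get_dataset() above has already written 'word2idx.json'. The token
# 'london' is an arbitrary example; index 0 is the padding/unknown id.
if __name__ == '__main__':
    word2idx = load_dict(os.path.join(dataset_path, 'word2idx.json'))
    print(word2idx.get('london', 0))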