def load_training_data(path='data/training_label.txt'):
    """Read a training file and split every line into space-separated tokens.

    Args:
        path: File to read. A path containing ``'training_label'`` is
            treated as labelled data; any other path as unlabelled data.

    Returns:
        For a labelled file, a tuple ``(x, y)``: ``x`` holds, per line, the
        tokens from the third one onwards and ``y`` holds the first token of
        each line (left as a string). For any other file, only the list of
        per-line token lists is returned.
    """
    with open(path, 'r') as f:
        rows = [line.strip('\n').split(' ') for line in f]
    if 'training_label' not in path:
        return rows
    # Token 0 is the label; token 1 is skipped -- presumably a separator
    # between label and sentence (TODO confirm against the data file).
    x = [row[2:] for row in rows]
    y = [row[0] for row in rows]
    return x, y


def load_testing_data(path='data/testing_data'):
    """Read the testing file and return one token list per data line.

    The first line of the file is skipped. On every remaining line the first
    comma-separated field is dropped, the other fields are concatenated
    (without the commas that separated them), surrounding whitespace is
    stripped, and the result is split on single spaces.

    Args:
        path: File to read.

    Returns:
        A list of token lists, one per line after the first.
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    sentences = [
        "".join(line.strip('\n').split(",")[1:]).strip()
        for line in lines[1:]
    ]
    return [sentence.split(' ') for sentence in sentences]


print("loading training data ...")
train_x, train_y = load_training_data('/kaggle/input/ml2020spring-hw4/training_label.txt')
train_x_no_label = load_training_data('/kaggle/input/ml2020spring-hw4/training_nolabel.txt')
print("loading testing data ...")
test_x = load_testing_data('/kaggle/input/ml2020spring-hw4/testing_data.txt')
print("loading data end")
读取完数据之后,打印其中的一项,查看读取出来的数据格式:
读取数据的函数将每一个 sentence 转化成了一个 wordlist,但我们不能直接将 wordlist 输入我们的 RNN,我们需要先将每一个 word 转化成向量。
为了将 word 转化成向量,我们需要一个 word_vector_dict,这个字典记录了每一个 word 到 vector 的映射关系。但我们手头并没有这个字典;按照题目的要求,我们需要自己训练得到这个字典,想必 training_nolabel.txt 这个文件的意义就在于此了。
至于用什么模型去训练出这样一个字典,课程作业介绍视频里推荐我们使用 gensim。在查阅了在 Google 上找到的文档(点我前往)之后,我实现的代码如下:
1 2 3 4 5 6 7 8
from gensim.models import word2vec


def train_word2vec(x):
    """Train a Word2Vec model on tokenised sentences and return it.

    Args:
        x: The sentences to train on, passed straight to ``Word2Vec``;
            the call site below passes lists of word lists.

    Returns:
        The ``Word2Vec`` model built from ``x``.
    """
    # NOTE(review): ``size`` and ``iter`` are the gensim 3.x keyword names;
    # gensim 4 renamed them to ``vector_size`` and ``epochs`` -- confirm the
    # installed gensim version before running.
    model = word2vec.Word2Vec(
        x,
        size=250,
        window=5,
        min_count=5,
        workers=12,
        iter=10,
        sg=1,
    )
    return model


print("trainning model ...")
# Alternative kept from the original: also train on the unlabelled sentences.
# model = train_word2vec(train_x + test_x + train_x_no_label)
model = train_word2vec(train_x + test_x)
print("trainning model end ...")
def transform_wordlist_ids(wordlist, word_index_dict, unk_id=2):
    """Map every word in ``wordlist`` to its integer id.

    Args:
        wordlist: Iterable of words (one sentence).
        word_index_dict: Mapping from word to id.
        unk_id: Id substituted for words missing from ``word_index_dict``.
            Defaults to 2, the value this function previously hard-coded.

    Returns:
        A list of ids, in the same order as ``wordlist``.
    """
    return [word_index_dict.get(word, unk_id) for word in wordlist]
def transform_sentense_list_id_list(sentence_list, word_index_dict):
    """Convert a list of sentences (word lists) into a list of id lists.

    Args:
        sentence_list: Iterable of word lists, one per sentence.
        word_index_dict: Mapping from word to id, forwarded unchanged to
            ``transform_wordlist_ids`` for every sentence.

    Returns:
        A list with one id list per sentence, in input order.
    """
    return [
        transform_wordlist_ids(wordlist, word_index_dict)
        for wordlist in sentence_list
    ]
Comments