Le plongement de mots est une représentation des données textuelles qui consiste à associer un vecteur à chaque mot.
Gensim est un package Python dédié au NLP. Il propose une implémentation de l'algorithme Word2Vec qui permet d'entraîner des plongements de mots.
Gensim peut être installé depuis le dépôt PyPI avec pip :
pip install gensim
!pip install -q gensim
import os import pickle import re import unicodedata import nltk import numpy as np from gensim.models import Word2Vec
# Word tokenizer, kept at module level so other cells can reuse it.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Compiled once instead of being re-built on every preprocess() call:
# blanks out @mentions, URLs, and any character that is not a latin
# letter, a space, or a tab.
_CLEAN_RE = re.compile(r"(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)")


def preprocess(line):
    """Normalize one line of raw text into a list of lowercase ASCII tokens.

    Steps: transliterate accented characters to ASCII (NFKD + encode/decode
    round-trip, dropping anything with no ASCII equivalent), blank out
    @mentions, URLs and punctuation, lowercase, then tokenize on word
    characters.

    Args:
        line: raw text line (str).
    Returns:
        list[str] of lowercase tokens (possibly empty).
    """
    def remove_no_latin(string):
        # NFKD decomposition splits 'é' into 'e' + combining accent; the
        # ASCII round-trip then drops the accent and any non-latin char.
        return unicodedata \
            .normalize('NFKD', string) \
            .encode('ASCII', 'ignore') \
            .decode('utf-8')

    new_line = remove_no_latin(line)
    new_line = _CLEAN_RE.sub(" ", new_line)
    new_line = new_line.lower()
    return tokenizer.tokenize(new_line)
class MyCorpus(object):
    """Restartable iterable over a directory of text files.

    Yields one tokenized sentence (via ``preprocess``) per line of every
    file directly under ``dirname``. gensim's Word2Vec scans the corpus
    several times, so a fresh iterator is produced on each ``__iter__``.
    """

    def __init__(self, dirname):
        # Directory containing the raw corpus files.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # `with` guarantees the handle is closed even if the consumer
            # stops iterating early (the original leaked open files).
            # NOTE(review): utf-8 assumed for the corpus files — confirm;
            # the original relied on the platform default encoding.
            with open(os.path.join(self.dirname, fname), encoding='utf-8') as fh:
                for line in fh:
                    yield preprocess(line)
# Stream sentences lazily from the corpus directory.
# NOTE(review): DATA_PATH, EMBEDDING_DIM and EMBEDDING_PATH are defined in an
# earlier cell not shown here — verify they are set before running.
sentences = MyCorpus(dirname=DATA_PATH)
# Train a CBOW model (sg=0) with hierarchical softmax (hs=1), discarding
# words that appear fewer than 10 times in the corpus.
# NOTE(review): `size=` is the gensim 3.x parameter name; gensim 4.x renamed
# it to `vector_size=` — confirm the pinned gensim version.
model = Word2Vec(sentences=sentences, size=EMBEDDING_DIM, sg=0, hs=1, min_count=10)
# Sanity check: nearest neighbours of 'musique' in the embedding space.
model.wv.most_similar('musique')
[('le', 0.9998461008071899), ('lannee', 0.9998108148574829), ('disque', 0.9998019933700562), ('il', 0.9998003244400024), ('ses', 0.9997894763946533), ('carriere', 0.9997774362564087), ('apres', 0.9997769594192505), ('on', 0.9997597932815552), ('ou', 0.9997404217720032), ('en', 0.9997389316558838)]
# Save the learned embeddings in the textual word2vec format.
# makedirs(..., exist_ok=True) replaces the racy exists()-then-makedirs
# check: it is atomic with respect to concurrent creation and shorter.
os.makedirs(EMBEDDING_PATH, exist_ok=True)
output_file = os.path.join(EMBEDDING_PATH, f'embedding-word2vec-{EMBEDDING_DIM}d.txt')
model.wv.save_word2vec_format(output_file, binary=False)