from sklearn.feature_extraction.text import CountVectorizer

# create CountVectorizer object
vectorizer = CountVectorizer()

corpus = [
    'Text of the very first new sentence with the first words in sentence.',
    'Text of the second sentence.',
    'Number three with lot of words words words.',
    'Short text, less words.',
]
# learn the vocabulary and store the CountVectorizer sparse matrix in term_frequencies
term_frequencies = vectorizer.fit_transform(corpus)
vocab = vectorizer.get_feature_names()  # use get_feature_names_out() in newer scikit-learn
# convert sparse matrix to numpy array
term_frequencies = term_frequencies.toarray()
# visualize term frequencies
import seaborn as sns
sns.heatmap(term_frequencies, annot=True, cbar=False, xticklabels=vocab);
# Load pretrained vectors from Google
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
king = model['king']
# king - man + woman = queen
print(model.most_similar(positive=['woman', 'king'], negative=['man']))
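Not part of the original snippet, but as a quick sanity check of the loaded vectors, KeyedVectors also exposes pairwise similarity directly:

# cosine similarity between related and unrelated word pairs
print(model.similarity('king', 'queen'))   # noticeably high for related words
print(model.similarity('king', 'carpet'))  # much lower for unrelated words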
# Load the spacy model that you have installed
import en_core_web_lg
nlp = en_core_web_lg.load()

# process a sentence using the model
doc = nlp("man king stands on the carpet and sees woman queen")
Find the similarity between King and Queen (the higher the value, the better).
doc[1].similarity(doc[9]) # 0.72526103
Find the similarity between King and carpet.
doc[1].similarity(doc[5]) # 0.20431946
Check if king - man + woman = queen. We multiply the vectors for 'man' and 'woman' by two, because subtracting a single 'man' vector and adding a single 'woman' vector changes the original 'king' vector only slightly, most likely because 'man' and 'woman' are closely related themselves.
v = doc[1].vector - (doc[0].vector*2) + (doc[8].vector*2)
from scipy.spatial import distance
import numpy as np
# Format the vocabulary for use in the distance function
vectors = [token.vector for token in doc]
vectors = np.array(vectors)
# Find the closest word below
closest_index = distance.cdist(np.expand_dims(v, axis=0), vectors, metric='cosine').argmin()
output_word = doc[closest_index].text
print(output_word)
# download pre-trained word vectors for one of 157 languages: https://fasttext.cc/docs/en/crawl-vectors.html
# it will take some time, about 5 minutes
import fasttext
import fasttext.util

fasttext.util.download_model('en', if_exists='ignore')  # English
ft = fasttext.load_model('cc.en.300.bin')
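A short usage sketch (my addition, assuming the standard fasttext Python API with get_word_vector and get_nearest_neighbors):

# 300-dimensional vector for a single word
print(ft.get_word_vector('king').shape)

# nearest neighbours by cosine similarity: a list of (score, word) pairs
print(ft.get_nearest_neighbors('king')[:5])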
ELMo is a deep contextualized word representation that models both (1) complex characteristics of word use (e.g., syntax and semantics), and (2) how these uses vary across linguistic contexts (i.e., to model polysemy). These word vectors are learned functions of the internal states of a deep bidirectional language model (biLM), which is pre-trained on a large text corpus. They can be easily added to existing models and significantly improve the state of the art across a broad range of challenging NLP problems, including question answering, textual entailment, and sentiment analysis.
# use tensorflow 1.x for ELMo, because there is still no ELMo for tensorflow 2.0
%tensorflow_version 1.x
import tensorflow_hub as hub
import tensorflow as tf
# Download pretrained ELMo model from Tensorflow Hub https://tfhub.dev/google/elmo/3
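The loading step itself is not shown in the snippet above; a minimal sketch, assuming the TF1 hub.Module API:

# load the ELMo module (TF1 Hub format)
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=False)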
sentences = [
    'king arthur, also called arthur or aathur pendragon, legendary british king who appears in a cycle of '
    'medieval romances (known as the matter of britain) as the sovereign of a knightly fellowship of the round table.',
    'it is not certain how these legends originated or whether the figure of arthur was based on a historical person.',
    'the legend possibly originated either in wales or in those parts of northern britain inhabited by brythonic-speaking celts.',
    'for a fuller treatment of the stories about king arthur, see also arthurian legend.',
]
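The step that produces masked_embs and masked_words (used by PCA below) is not shown above; a minimal sketch under the TF1 hub API, where the "elmo" output gives one 1024-dimensional vector per token and we keep only the positions that correspond to real words:

import numpy as np

# run the sentences through ELMo; the "elmo" output has shape [batch, max_len, 1024]
embeddings = elmo(sentences, signature="default", as_dict=True)["elmo"]

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    embs = sess.run(embeddings)

# keep one vector per actual word, dropping the padding positions
masked_words, masked_embs = [], []
for i, sentence in enumerate(sentences):
    words = sentence.split()
    masked_words.extend(words)
    masked_embs.extend(embs[i][:len(words)])
masked_embs = np.array(masked_embs)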
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
y = pca.fit_transform(masked_embs)
from sklearn.manifold import TSNE
y = TSNE(n_components=2).fit_transform(y)
import plotly as py
import plotly.graph_objs as go
data = [
    go.Scatter(
        x=[i[0] for i in y],
        y=[i[1] for i in y],
        mode='markers',
        text=[i for i in masked_words],
        marker=dict(
            size=16,
            color=[len(i) for i in masked_words],  # set color equal to a variable
            opacity=0.8,
            colorscale='Viridis',
            showscale=False
        )
    )
]

layout = dict(
    yaxis=dict(zeroline=False),
    xaxis=dict(zeroline=False)
)

fig = go.Figure(data=data, layout=layout)
fig.show()
Now we import PyTorch, the pretrained BERT model, and a BERT tokenizer, which does all the work needed to convert a sentence into the input format BERT expects (splitting it into tokens and adding the special tokens such as [SEP] and [CLS]).
import torch
torch.manual_seed(0)
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt
%matplotlib inline
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
Feed in some sentences and tokenize them.
sentences = [
    'king arthur, also called arthur or aathur pendragon, legendary british king who appears in a cycle of '
    'medieval romances (known as the matter of britain) as the sovereign of a knightly fellowship of the round table.',
    'it is not certain how these legends originated or whether the figure of arthur was based on a historical person.',
    'the legend possibly originated either in wales or in those parts of northern britain inhabited by brythonic-speaking celts.',
    'for a fuller treatment of the stories about king arthur, see also arthurian legend.',
]
# Print the original sentence.
print(' Original: ', sentences[0][:99])
# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(sentences[0])[:15])
# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentences[0]))[:15])
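The code that produces kings, king_table, and king_archtur for the printout below is not shown here; a minimal sketch, assuming a recent transformers API (BertModel outputs with last_hidden_state) and cosine distance over the last hidden layer. Looking up token positions by surface form is my simplification for illustration:

from scipy.spatial.distance import cosine

# Load the pre-trained BERT model and switch it to evaluation mode
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

# Run the first sentence through BERT and take one vector per token from the last layer
inputs = tokenizer(sentences[0], return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
token_vecs = outputs.last_hidden_state[0]
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())

# Find the token positions we want to compare
king_positions = [i for i, t in enumerate(tokens) if t == 'king']
table_pos = tokens.index('table')
arthur_pos = tokens.index('arthur')

# Cosine distances between contextual embeddings
kings = cosine(token_vecs[king_positions[0]].numpy(), token_vecs[king_positions[1]].numpy())
king_table = cosine(token_vecs[king_positions[0]].numpy(), token_vecs[table_pos].numpy())
king_archtur = cosine(token_vecs[arthur_pos].numpy(), token_vecs[king_positions[0]].numpy())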
print('Distance for two kings: %.2f' % kings)
print('Distance from king to table: %.2f' % king_table)
print('Distance from Arthur to king: %.2f' % king_archtur)