ํ ์คํธ ์ ์ฌ๋
์ฝ์ฌ์ธ ์ ์ฌ๋ (Cosine Similarity) | -> ๋ ๊ฐ์ ๋ฒกํฐ ๊ฐ์ Cos ๊ฐ๋ |
์ ํด๋ฆฌ๋์ธ ์ ์ฌ๋ (Euclidean Similarity) | -> ๋ ๊ฐ์ ์ ์ฌ์ด์ ๊ฑฐ๋ฆฌ = L2 ๊ฑฐ๋ฆฌ |
๋งจํํ ์ ์ฌ๋ (Menhattan Similarity) | -> ์ฌ๊ฐ ๊ฒฉ์ ์ต๋จ ๊ฑฐ๋ฆฌ = L1 ๊ฑฐ๋ฆฌ |
์์นด๋ ์ ์ฌ๋ (Jaccard Similarity) | -> ๊ต์งํฉ๊ณผ ํฉ์งํฉ์ ํฌ๊ธฐ๋ก ๊ณ์ฐ |
๋ ๋ฌธ์ฅ์ด ์ฃผ์ด์ก์ ๋, ๋ ๋ฌธ์ฅ์ด ์๋ก ์ผ๋ง๋ ์ ์ฌํ์ง ๋ํ๋ด์ฃผ๋ ๊ธฐ๋ฒ
์๋์์ ์ ๋ ฅ๊ฐ์ผ๋ก ๋ฐ๋ Sentences๋ ["Hello World", "Hello Word"] ํ์์ด๋ค.
### ์ฝ์ฌ์ธ ์ ์ฌ๋ ###
def cos_performance(sentences) :
tfidf_vectorizer = TfidfVectorizer()
# ๋ฌธ์ฅ ๋ฒกํฐํ(์ฌ์ ๋ง๋ค๊ธฐ)
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
cos_similar = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
return cos_similar[0][0]
### ์ ํด๋ฆฌ๋์ธ ์ ์ฌ๋ (๋ ์ ์ฌ์ด์ ๊ฑฐ๋ฆฌ ๊ตฌํ๊ธฐ) ###
def euclidean_performance(sentences) :
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
## ์ ๊ทํ ##
tfidf_normalized = tfidf_matrix/np.sum(tfidf_matrix)
euc_d_norm = euclidean_distances(tfidf_normalized[0:1],tfidf_normalized[1:2])
return euc_d_norm[0][0]
### ๋งจํํ ์ ์ฌ๋(๊ฒฉ์๋ก ๋ ๊ฑฐ๋ฆฌ์์์ ์ต๋จ๊ฑฐ๋ฆฌ) ###
def manhattan_performance(sentences) :
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
## ์ ๊ทํ ##
tfidf_normalized = tfidf_matrix/np.sum(tfidf_matrix)
manhattan_d = manhattan_distances(tfidf_normalized[0:1],tfidf_normalized[1:2])
return manhattan_d[0][0]
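The table above also lists Jaccard similarity, for which there is no code here; a minimal sketch in the same style, assuming simple whitespace tokenization:

### Jaccard similarity (size of intersection over size of union of token sets) ###
def jaccard_performance(sentences):
    tokens_1 = set(sentences[0].lower().split())
    tokens_2 = set(sentences[1].lower().split())
    # |intersection| / |union|
    return len(tokens_1 & tokens_2) / len(tokens_1 | tokens_2)

All four functions are called the same way, e.g. cos_performance(["Hello World", "Hello Word"]).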
Using Sentence Transformers

| Model | Hugging Face checkpoint |
| --- | --- |
| Sentence Transformer | sentence-transformers/all-MiniLM-L6-v2 |
| Sentence Transformer | sentence-transformers/bert-base-nli-mean-tokens |

These are Sentence Transformer models published on Hugging Face for measuring sentence similarity.
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

def sentence_transformer(sentences):
    torch.manual_seed(42)  # fix the random seed for reproducibility
    model_name = "sentence-transformers/bert-base-nli-mean-tokens"  # or "sentence-transformers/all-MiniLM-L6-v2"
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

    # Mean pooling: average the token embeddings, ignoring padding positions
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # first element holds the per-token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    # local_files_only=True assumes the checkpoint is already in the local cache
    tokenizer = AutoTokenizer.from_pretrained(model_name, local_files_only=True)
    model = AutoModel.from_pretrained(model_name, local_files_only=True)
    model.to(device)

    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
    encoded_input = encoded_input.to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool the token embeddings into one vector per sentence, then L2-normalize
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings
์ด๋ ๊ฒ ๋ฌธ์ฅ๋ค์ ๋ฌธ์ฅ ์ ์ฌ๋๋ฅผ ๊ตฌํ ์ ์๋ค.
NLI ๋ฐ์ดํฐ์ ์ ๋ฃ์ด์ ์ ์๋ฅผ ๊ตฌํ๊ณ , ์ด๋ฅผ ์๊ฐํ ํ๋ฉด ์๋์ ๊ฐ์ด ์ถ๋ ฅํ ์ ์๋ค.
(์ฐ๊ด, ๋ชจํธ, ๋ชจ์ ๋ฐ์ดํฐ์ )
WordNet ์๋ฏธ ์ ์ฌ๋ ์ธก์
๊ฒฝ๋ก ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ ์ ์ฌ๋ |
|
Leacock Chordorow ์ ์ฌ๋ |
|
Wu-Palmer ์ ์ฌ๋ |
|
### Path-distance-based, Leacock-Chodorow, and Wu-Palmer similarity ###
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')  # download the WordNet corpus on first use

right_whale = wordnet.synset('right_whale.n.01')
orca = wordnet.synset('orca.n.01')

# Path-distance-based similarity
print(right_whale.path_similarity(orca))
# Leacock-Chodorow similarity (both synsets must share a part of speech)
print(right_whale.lch_similarity(orca))
# Wu-Palmer similarity
print(right_whale.wup_similarity(orca))
Putting it all together, pairwise cosine similarity over a whole dataset can be visualized as a heatmap:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('train.csv')

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['sentence'])

# Pairwise cosine similarity between all sentences
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(cosine_similarities)

# Visualize the cosine similarity matrix as a heatmap
plt.imshow(cosine_similarities, interpolation='nearest')
plt.colorbar()
plt.show()