1. 각 라벨별 가장 많이 나오는 단어 찾기
from collections import Counter
import pandas as pd
def most_common_words_by_label(df, top_n=5):
    """Return the most frequent whitespace-separated words for each label.

    Args:
        df: DataFrame with a ``label`` column and a ``sentence`` column.
        top_n: How many of the most frequent words to keep per label.

    Returns:
        dict mapping each distinct label (in order of first appearance) to a
        list of ``(word, count)`` tuples, most frequent first.
    """
    common_words = {}
    for label in df['label'].unique():
        # Missing sentences are NaN (a float); joining or splitting them would
        # raise, so drop them before counting.
        sentences = df.loc[df['label'] == label, 'sentence'].dropna()
        word_counts = Counter(
            word for sentence in sentences for word in str(sentence).split()
        )
        common_words[label] = word_counts.most_common(top_n)
    return common_words


if __name__ == "__main__":
    df = pd.read_csv('train.csv')
    for label, top_words in most_common_words_by_label(df).items():
        print(f"'{label}'에서 가장 많이 나오는 단어: {top_words}")
2. 각 라벨별 분포도 체크
import pandas as pd
import matplotlib.pyplot as plt
# Load the training data and count how many rows carry each distinct label.
df = pd.read_csv('train.csv')
labels = df['label'].value_counts()
print(labels)

# Bar chart of the same counts: one bar per label, bar height = row count.
plt.bar(x=labels.index, height=labels.values)
plt.show()
3. 문장 내 표제어 및 어간 추출
from konlpy.tag import Kkma, Komoran, Okt, Hannanum

# Quick reference for the KoNLPy tagger methods (translated from the
# original notes):
#   .morphs                      -> morpheme extraction
#   .phrases                     -> phrase (eojeol) extraction
#   okt.morphs(text, stem=True)  -> stem extraction
#   .nouns                       -> extract only the nouns among the parts of speech
#   .pos                         -> tag each part of speech

# One instance of every analyzer; only `okt` is used for tokenizing below.
okt = Okt()
kkma = Kkma()
komoran = Komoran()
hannanum = Hannanum()

df = pd.read_csv('train.csv')

# Run the Okt morpheme splitter over every sentence in the data.
tokenized_texts = df['sentence'].apply(okt.morphs)
print(tokenized_texts)
4. 데이터프레임 내 단어 검색
import pandas as pd
def search_topic(df, topic):
    """Return the index labels of rows whose ``sentence`` contains ``topic``.

    The check is a case-insensitive substring match: both the topic and each
    sentence are lowercased before comparing.

    Args:
        df: DataFrame with a ``sentence`` column.
        topic: Substring to look for.

    Returns:
        list of index labels of the matching rows, in row order. Rows whose
        sentence is not a string (e.g. NaN for a missing value) never match.
    """
    needle = topic.lower()  # lowercase once rather than on every row
    return [
        idx
        for idx, sentence in df['sentence'].items()
        # Missing values are NaN floats with no .lower(); skip them instead
        # of raising AttributeError.
        if isinstance(sentence, str) and needle in sentence.lower()
    ]
# Find the index labels of every row whose sentence mentions "school".
topic_index = search_topic(df, 'school')
print(topic_index)

# Show the matching sentences themselves, selected by those index labels.
print(df.loc[topic_index, 'sentence'])
728x90
반응형