|
##### remove URLs |
|
def remove_url(txt):
    """Strip URLs and all non-alphanumeric characters from *txt*.

    The second regex alternative consumes whole ``scheme://...`` URLs in one
    match; the first removes any remaining character that is not a letter,
    digit, space or tab. Whitespace is then collapsed to single spaces.
    """
    # Raw string: '\w' and '\/' in a plain string are invalid escape
    # sequences and raise SyntaxWarning on Python 3.12+.
    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())
|
df.text = df.text.apply(lambda x: remove_url(x)) |
|
|
|
|
|
##### remove Punctuations except !? |
|
def remove_punct(text):
    """Remove all ASCII punctuation from *text*, keeping '!' and '?'.

    '!' and '?' are preserved because they carry sentiment signal.
    """
    # Build the deletion set with plain string methods instead of the
    # original regex '\ |\!|\?', whose escapes are invalid sequences
    # (SyntaxWarning on Python 3.12+).  string.punctuation contains no
    # space, so dropping the space rule changes nothing.
    to_delete = punctuation.replace("!", "").replace("?", "")
    table = str.maketrans("", "", to_delete)
    return text.translate(table)
|
df.text = df.text.apply(lambda x: remove_punct(x)) |
|
|
|
|
|
##### Remove Emojis |
|
# Compiled once at import time so repeated .apply() calls over a whole
# DataFrame column do not recompile the pattern on every row.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters (deliberately broad range)
    "]+",
    flags=re.UNICODE,
)

def remove_emoji(text):
    """Delete emoji and pictographic characters from *text*."""
    return _EMOJI_PATTERN.sub("", text)
|
df.text = df.text.apply(lambda x: remove_emoji(x)) |
|
|
|
|
|
##### Remove HTML tags |
|
# Compiled once at import time (avoids recompiling per row).
# '.' matches any character except newline; '*?' is non-greedy, so each tag
# is matched on its own instead of one match spanning '<p>...</p>'.
_HTML_TAG = re.compile(r"<.*?>")

def remove_html(text):
    """Strip HTML/XML tags — anything between '<' and '>' — from *text*."""
    return _HTML_TAG.sub("", text)
|
df.text = df.text.apply(lambda x: remove_html(x)) |
|
|
|
|
|
##### Remove Stopwords |
|
# NLTK English stopwords; materialized as a set once so each membership
# test inside remove_stopwords is O(1).
stop = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize *text*, drop English stopwords, return the rest lower-cased."""
    word_tokens = word_tokenize(text)
    # 'not in' is the idiomatic (PEP 8) spelling of 'not w.lower() in stop';
    # a generator feeds join without building an intermediate list.
    return ' '.join(w.lower() for w in word_tokens if w.lower() not in stop)
|
df['text_nostopwords'] = df.text.apply(lambda x: remove_stopwords(x)) |
|
|
|
|
|
|
|
##### Expand contractions - e.g. "won't" to "will not"
|
# (pattern, replacement) pairs. Order matters: the specific forms ("won't",
# "can't") must be rewritten before the generic "n't" rule runs.
_CONTRACTION_RULES = [
    (r"won\'t", "will not"),   # specific
    (r"can\'t", "can not"),
    (r"n\'t", " not"),         # general
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
]

def decontraction(phrase):
    """Expand English contractions in *phrase* (e.g. "won't" -> "will not")."""
    for pattern, replacement in _CONTRACTION_RULES:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
|
df.text = [decontraction(tweet) for tweet in df.text] |
|
|
|
|
|
##### Lemmatization - map inflected words to their root form
|
# The stopword set was already built above with stop = set(stopwords.words('english'))
|
# Single shared lemmatizer instance; pos='v' treats every token as a verb.
lemmatizer = WordNetLemmatizer()

def lemma(text):
    """Lower-case and verb-lemmatize each token of *text*, rejoined by spaces."""
    tokens = word_tokenize(text)
    roots = (lemmatizer.lemmatize(tok.lower(), pos='v') for tok in tokens)
    return ' '.join(roots)
|
df.text = df.text.apply(lambda x: lemma(x)) |