-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_cleaner.py
More file actions
25 lines (23 loc) · 1.06 KB
/
Copy pathtext_cleaner.py
File metadata and controls
25 lines (23 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import re
import spacy
def clean_text(text):
text = text.lower() # Lowercase conversion
text = re.sub(r'http\S+|www\S+|https\S+', '', text) # URL removal
text = re.sub(r'<.*?>', '', text) # HTML tag removal
text = re.sub(r'[^a-z\s]', '', text) # Special character removal
text = re.sub(r'\s+', ' ', text).strip() # Extra whitespace removal #strip to remove forward or backward spcaes
text = re.sub(r'\S+@\S+\.\S+', '', text) # email removal
text = re.sub(r'[^\w\s]', '', text) # punctuations
text = re.sub(r'(.)\1{2,}', r'\1', text) # Repeated Characters / Elongated Words
text = re.sub(r'#', '', text) # Hashtags (like from Twitter/Instagram)
text= re.sub(r'@\w+', '', text) # Mention username
text= re.sub(r'[^\x00-\x7F]+', '', text) # non ASCII characters (emojis , foreign characters)
return text
def clean_text_spacy(text):
nlp = spacy.load('en_core_web_sm')
doc=nlp(text)
word=[]
for token in doc:
if not token.is_stop and not token.is_punct:
word.append(token.lemma_)
return word