In [None]:
import nltk

# Download only the NLTK resources this notebook uses, rather than the whole
# 'all' collection, which is very large and slow to fetch on a fresh machine.
NLTK_RESOURCES = (
    'punkt',          # tokenizer models used by word_tokenize
    'punkt_tab',      # tokenizer tables looked up by newer NLTK releases; older
                      # releases just report it as unknown and carry on
    'stopwords',      # stopwords.words('english')
    'wordnet',        # WordNetLemmatizer
    'omw-1.4',        # presumably needed next to 'wordnet' on some NLTK versions
    'vader_lexicon',  # SentimentIntensityAnalyzer
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# import libraries
import pandas as pd

import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer

# Load the amazon review dataset

# Location of the CSV that is read into `df`
AMAZON_REVIEWS_URL = 'https://raw.githubusercontent.com/pycaret/pycaret/master/datasets/amazon.csv'
df = pd.read_csv(AMAZON_REVIEWS_URL)

df

In [None]:
# Pick one review by position to serve as the worked example for the
# tokenize / stop-word / lemmatize cells that follow, and display its text
idx = 19998
df['reviewText'].iloc[idx]

In [None]:
# Lower-case the example review first, then split it into word tokens
review_lower = df['reviewText'].iloc[idx].lower()
tokens = word_tokenize(review_lower)

tokens

In [None]:
# Remove stop words

# Load NLTK's English stop-word list and preview its first ten entries
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')
eng_stopwords[:10]

In [None]:
# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension would rebuild the list for every token, and a set makes each
# membership check constant-time instead of a list scan.
stopword_set = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token not in stopword_set]

filtered_tokens

In [None]:
# Lemmatize the tokens

# One lemmatizer instance is mapped over every token that survived filtering
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))

lemmatized_tokens

In [None]:
# Join the lemmatized tokens back into one space-separated string
processed_text = ' '.join(lemmatized_tokens)
processed_text

In [None]:
# Show the example review again for side-by-side comparison with processed_text.
# NOTE: a later cell overwrites df['reviewText'] with preprocessed text, so this
# only shows the raw review when the notebook is run top to bottom.
df['reviewText'].iloc[idx]

In [None]:
# create preprocess_text function

# Filled on the first call to preprocess_text so that the stop-word list is read
# and the lemmatizer is constructed once, instead of once per token / per review.
_PREPROCESS_CACHE = {}


def preprocess_text(text):
    """Lower-case, tokenize, drop English stop words, lemmatize and re-join.

    Parameters
    ----------
    text : str
        Text to clean; ``.lower()`` is called on it, so it must be a string.

    Returns
    -------
    str
        The lemmatized, non-stop-word tokens joined by single spaces.
    """
    if not _PREPROCESS_CACHE:
        # A frozenset gives constant-time membership checks; the original list
        # was rebuilt and scanned linearly for every single token.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    stop_words = _PREPROCESS_CACHE['stop_words']
    lemmatizer = _PREPROCESS_CACHE['lemmatizer']

    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens and join them back into a string
    return ' '.join(lemmatizer.lemmatize(token) for token in filtered_tokens)

# Run preprocess_text over every review, then overwrite the 'reviewText'
# column with the cleaned text

cleaned_reviews = df['reviewText'].apply(preprocess_text)
df['reviewText'] = cleaned_reviews
df

In [None]:
# initialize NLTK sentiment analyzer

# Single VADER analyzer instance; get_sentiment reads it as a global
analyzer = SentimentIntensityAnalyzer()


# create get_sentiment function

def get_sentiment(text, score_key='pos', threshold=0):
    """Turn the analyzer's polarity scores for ``text`` into a 0/1 label.

    Parameters
    ----------
    text : str
        Text passed to ``analyzer.polarity_scores``.
    score_key : str, optional
        Which entry of the returned score dict to test. Defaults to ``'pos'``.
    threshold : float, optional
        The label is 1 only when the chosen score is strictly greater than
        this value. Defaults to ``0``.

    Returns
    -------
    int
        1 if ``scores[score_key] > threshold``, otherwise 0.

    Notes
    -----
    With the defaults, any text with a non-zero ``'pos'`` score is labelled 1.
    The parameters let callers apply a stricter rule (for example a different
    score entry or a higher cut-off) without editing this function.
    """
    scores = analyzer.polarity_scores(text)

    return 1 if scores[score_key] > threshold else 0




# Label every review with get_sentiment and store the 0/1 results in a
# 'sentiment' column

predicted_sentiment = df['reviewText'].apply(get_sentiment)
df['sentiment'] = predicted_sentiment

df

In [None]:
from sklearn.metrics import confusion_matrix

# Compare the dataset's 'Positive' column (first argument) with the
# 'sentiment' column produced above (second argument)
actual_labels = df['Positive']
predicted_labels = df['sentiment']
print(confusion_matrix(actual_labels, predicted_labels))

In [None]:
from sklearn.metrics import classification_report

# Per-class summary for the same pair of columns: 'Positive' vs. 'sentiment'
report_text = classification_report(df['Positive'], df['sentiment'])
print(report_text)

In [None]:
# http://text-processing.com/demo/

In [None]:
from nltk import FreqDist, bigrams
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd

# Tokenize each review on its own and collect bigrams per review. Joining all
# reviews into one string first would pair the last token of one review with
# the first token of the next, counting bigrams that no reviewer ever wrote.
tokenized_reviews = [word_tokenize(review.lower()) for review in df['reviewText']]
bigrams_list = [
    pair
    for review_tokens in tokenized_reviews
    for pair in bigrams(review_tokens)
]
bigram_freq = FreqDist(bigrams_list)

# Get the top 10 most common bigrams
top_bigrams = bigram_freq.most_common(10)

# Prepare data for visualization
bigram_words = [" ".join(bigram) for bigram, freq in top_bigrams]
bigram_counts = [freq for bigram, freq in top_bigrams]

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bigram_words, bigram_counts, color="skyblue")
ax.set_xlabel("Frequency")
ax.set_title("Top 10 Most Common Bigrams")
ax.invert_yaxis()  # Invert y-axis to have the most frequent bigram at the top
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Split the reviews by their 'sentiment' value (1 vs. 0) and concatenate each
# group into one long string
is_positive = df['sentiment'] == 1
is_negative = df['sentiment'] == 0
positive_reviews = " ".join(df.loc[is_positive, 'reviewText'])
negative_reviews = " ".join(df.loc[is_negative, 'reviewText'])

# Build one word cloud per group (white background for 1, black for 0)
positive_wordcloud = WordCloud(
    width=800, height=400, background_color='white'
).generate(positive_reviews)
negative_wordcloud = WordCloud(
    width=800, height=400, background_color='black'
).generate(negative_reviews)

# Draw both clouds side by side with identical panel settings
fig, axes = plt.subplots(1, 2, figsize=(14, 7))
panels = (
    (positive_wordcloud, "Positive Reviews Word Cloud"),
    (negative_wordcloud, "Negative Reviews Word Cloud"),
)
for ax, (cloud, panel_title) in zip(axes, panels):
    ax.imshow(cloud, interpolation='bilinear')
    ax.set_title(panel_title)
    ax.axis('off')

plt.show()
