"""Analyze exported Discord messages: message frequency, timing, content,
per-user comparison and a simple activity trend projection. Plots are saved
under DATA_DIR/message_analysis."""

import os
import sys
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import emoji
from sklearn.linear_model import LinearRegression
import warnings

warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    message="Converting to PeriodArray/Index representation will drop timezone information.",
)

# Add the project root (two levels up from this script) to the Python path
# so that discord_tools can be imported when the script is run directly.
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(os.path.dirname(script_dir))
sys.path.insert(0, project_root)

from discord_tools.config.settings import DATA_DIR

# Download required NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Set up plot style
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'DejaVu Sans'


class DiscordMessageAnalyzer:
    def __init__(self, file_name):
        self.file_path = os.path.join(DATA_DIR, file_name)
        self.df = self.load_and_process_messages()
        self.output_dir = os.path.join(DATA_DIR, 'message_analysis')
        self.combined_dir = os.path.join(self.output_dir, 'combined')
        self.users = self.df['author'].unique()
        self.user_dirs = {user: os.path.join(self.output_dir, user) for user in self.users}
        self.create_output_directories()

    def load_and_process_messages(self):
        """Load the exported messages (each record needs 'timestamp', 'author' and
        'content') and derive the date/time and word-count columns used below."""
        with open(self.file_path, 'r', encoding='utf-8') as file:
            messages = json.load(file)
        df = pd.DataFrame(messages)
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601')
        df['timestamp'] = df['timestamp'].dt.tz_convert('Europe/Stockholm')
        df['date'] = df['timestamp'].dt.date
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['month'] = df['timestamp'].dt.month
        df['year'] = df['timestamp'].dt.year
        df['word_count'] = df['content'].astype(str).apply(lambda x: len(x.split()))
        return df

    def create_output_directories(self):
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.combined_dir, exist_ok=True)
        for user_dir in self.user_dirs.values():
            os.makedirs(user_dir, exist_ok=True)

    def analyze_message_frequency(self, df):
        """Aggregate message counts per day, week and month."""
        daily_messages = df.groupby('date').size().reset_index(name='message_count')
        peak_day = daily_messages.loc[daily_messages['message_count'].idxmax()]
        avg_messages = daily_messages['message_count'].mean()
        weekly_messages = df.groupby(df['timestamp'].dt.to_period('W').astype(str)).size()
        monthly_messages = df.groupby(df['timestamp'].dt.to_period('M').astype(str)).size()
        return {
            'daily_messages': daily_messages,
            'peak_day': peak_day,
            'avg_messages': avg_messages,
            'weekly_messages': weekly_messages,
            'monthly_messages': monthly_messages,
        }

    def visualize_message_frequency(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        sns.lineplot(x='date', y='message_count', data=data['daily_messages'])
        plt.title('Daily Message Frequency')
        plt.xlabel('Date')
        plt.ylabel('Number of Messages')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'daily_message_frequency.png'))
        plt.close()

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))
        sns.lineplot(data=data['weekly_messages'], ax=ax1)
        ax1.set_title('Weekly Message Trend')
        ax1.set_xlabel('Week')
        ax1.set_ylabel('Number of Messages')
        ax1.tick_params(axis='x', rotation=45)
        sns.lineplot(data=data['monthly_messages'], ax=ax2)
        ax2.set_title('Monthly Message Trend')
        ax2.set_xlabel('Month')
        ax2.set_ylabel('Number of Messages')
        ax2.tick_params(axis='x', rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'message_trends.png'))
        plt.close()

    def analyze_timing(self, df):
        """Count messages per hour of day and per weekday."""
        hourly_activity = df.groupby('hour').size()
        # Reindex so all seven weekdays are present even when some have no
        # messages; the hardcoded weekday labels in visualize_timing rely on this.
        daily_activity = df.groupby('day_of_week').size().reindex(range(7), fill_value=0)
        return {
            'hourly_activity': hourly_activity,
            'daily_activity': daily_activity,
        }

    def visualize_timing(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        sns.barplot(x=data['hourly_activity'].index, y=data['hourly_activity'].values)
        plt.title('Hourly Message Activity')
        plt.xlabel('Hour of Day')
        plt.ylabel('Number of Messages')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'hourly_activity.png'))
        plt.close()

        plt.figure(figsize=(12, 6))
        sns.barplot(x=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'],
                    y=data['daily_activity'].values)
        plt.title('Daily Message Activity')
        plt.xlabel('Day of Week')
        plt.ylabel('Number of Messages')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'daily_activity.png'))
        plt.close()

    def analyze_content(self, df):
        """Compute average message length, word frequencies and emoji frequencies."""
        avg_words = df['word_count'].mean()
        all_words = ' '.join(df['content'].astype(str).str.lower())
        word_freq = Counter(word_tokenize(all_words))
        emoji_freq = Counter(char for char in all_words if char in emoji.EMOJI_DATA)
        return {
            'avg_words': avg_words,
            'word_freq': word_freq,
            'emoji_freq': emoji_freq,
        }

    def visualize_content(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        word_freq_df = pd.DataFrame(data['word_freq'].most_common(20), columns=['Word', 'Frequency'])
        sns.barplot(x='Frequency', y='Word', data=word_freq_df)
        plt.title('Top 20 Most Common Words')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'word_frequency.png'))
        plt.close()

        plt.figure(figsize=(12, 8))
        emoji_freq_df = pd.DataFrame(data['emoji_freq'].most_common(20), columns=['Emoji', 'Frequency'])
        bars = plt.barh(emoji_freq_df['Emoji'], emoji_freq_df['Frequency'])
        plt.title('Top 20 Most Used Emojis')
        plt.xlabel('Frequency')
        plt.ylabel('Emoji')
        # Annotate each bar with its count
        for bar in bars:
            width = bar.get_width()
            plt.text(width, bar.get_y() + bar.get_height() / 2, f'{width}', ha='left', va='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'emoji_usage.png'))
        plt.close()

    def analyze_user_comparison(self):
        """Per-user message count, average length, most active hour and top words."""
        user_stats = {user: {
            'message_count': len(df),
            'avg_message_length': df['word_count'].mean(),
            'most_active_hour': df['hour'].value_counts().index[0],
            'most_used_words': Counter(' '.join(df['content'].astype(str)).split()).most_common(10)
        } for user, df in self.df.groupby('author')}
        return user_stats

    def visualize_user_comparison(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        sns.barplot(x=list(self.users), y=[data[user]['message_count'] for user in self.users])
        plt.title('Message Count by User')
        plt.xlabel('User')
        plt.ylabel('Number of Messages')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'user_message_count.png'))
        plt.close()

        plt.figure(figsize=(12, 6))
        sns.barplot(x=list(self.users), y=[data[user]['avg_message_length'] for user in self.users])
        plt.title('Average Message Length by User')
        plt.xlabel('User')
        plt.ylabel('Average Word Count')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'user_avg_message_length.png'))
        plt.close()

    def analyze_advanced_analytics(self, df):
        """Fit a linear trend to total monthly activity and project six months ahead."""
        # fill_value=0 so months in which a user sent no messages count as zero
        # rather than NaN (which would break the stacked area plot).
        monthly_activity = df.groupby(
            [df['timestamp'].dt.to_period('M').astype(str), 'author']
        ).size().unstack(fill_value=0)
        X = np.array(range(len(monthly_activity))).reshape(-1, 1)
        y = monthly_activity.sum(axis=1).values
        model = LinearRegression().fit(X, y)
        # Project the next six months of activity
        future_months = np.array(range(len(monthly_activity), len(monthly_activity) + 6)).reshape(-1, 1)
        predicted_activity = model.predict(future_months)
        return {
            'monthly_activity': monthly_activity,
            'predicted_activity': predicted_activity,
        }

    def visualize_advanced_analytics(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        data['monthly_activity'].plot(kind='area', stacked=True)
        plt.title('Communication Patterns Over Time')
        plt.xlabel('Month')
        plt.ylabel('Number of Messages')
        plt.legend(title='User', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'communication_patterns.png'))
        plt.close()

        plt.figure(figsize=(12, 6))
        plt.plot(range(len(data['monthly_activity'])),
                 data['monthly_activity'].sum(axis=1), label='Actual')
        plt.plot(range(len(data['monthly_activity']), len(data['monthly_activity']) + 6),
                 data['predicted_activity'], label='Predicted')
        plt.title('Message Activity Trend and Prediction')
        plt.xlabel('Months from Start')
        plt.ylabel('Number of Messages')
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'activity_prediction.png'))
        plt.close()

    def run_analysis(self):
        """Run the full analysis for all users combined and for each user separately."""
        print("Running combined analysis...")
        self.run_analysis_for_data(self.df, self.combined_dir)
        for user in self.users:
            print(f"Running analysis for user: {user}")
            user_df = self.df[self.df['author'] == user]
            self.run_analysis_for_data(user_df, self.user_dirs[user])

    def run_analysis_for_data(self, df, output_dir):
        freq_data = self.analyze_message_frequency(df)
        self.visualize_message_frequency(freq_data, output_dir)

        timing_data = self.analyze_timing(df)
        self.visualize_timing(timing_data, output_dir)

        content_data = self.analyze_content(df)
        self.visualize_content(content_data, output_dir)

        # The user comparison only makes sense for the combined data set
        if output_dir == self.combined_dir:
            user_data = self.analyze_user_comparison()
            self.visualize_user_comparison(user_data, output_dir)

        advanced_data = self.analyze_advanced_analytics(df)
        self.visualize_advanced_analytics(advanced_data, output_dir)


def main():
    file_name = 'message_data.json'
    analyzer = DiscordMessageAnalyzer(file_name)
    analyzer.run_analysis()
    print(f"Analysis complete. Results saved in '{analyzer.output_dir}'.")


if __name__ == "__main__":
    main()