Initial commit: Discord automation tools
scripts/discord_message_analyzer.py (new executable file, 267 lines added)
@@ -0,0 +1,267 @@
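"""Analyze exported Discord messages and plot per-user and combined statistics.

The expected input (inferred from the columns this script uses) is a JSON
array of message objects under DATA_DIR, each with at least 'timestamp'
(ISO 8601), 'author', and 'content' fields. All plots are written beneath
DATA_DIR/message_analysis/.
"""
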
import os
import sys
import json
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import emoji
from sklearn.linear_model import LinearRegression
import warnings

warnings.filterwarnings("ignore", category=UserWarning, message="Converting to PeriodArray/Index representation will drop timezone information.")
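# (The filter above silences the pandas warning triggered by calling
# .dt.to_period() on timezone-aware timestamps in the weekly/monthly
# groupbys further down.)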

# Make the top-level `discord_tools` package importable when running this file directly
script_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.dirname(os.path.dirname(script_dir))
sys.path.insert(0, project_root)

from discord_tools.config.settings import DATA_DIR

# Download required NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Set up plot style
plt.style.use('ggplot')
plt.rcParams['font.family'] = 'DejaVu Sans'


class DiscordMessageAnalyzer:
    def __init__(self, file_name):
        self.file_path = os.path.join(DATA_DIR, file_name)
        self.df = self.load_and_process_messages()
        self.output_dir = os.path.join(DATA_DIR, 'message_analysis')
        self.combined_dir = os.path.join(self.output_dir, 'combined')
        self.users = self.df['author'].unique()
        self.user_dirs = {user: os.path.join(self.output_dir, user) for user in self.users}
        self.create_output_directories()

    def load_and_process_messages(self):
        with open(self.file_path, 'r', encoding='utf-8') as file:
            messages = json.load(file)

        df = pd.DataFrame(messages)
        df['timestamp'] = pd.to_datetime(df['timestamp'], format='ISO8601')
        df['timestamp'] = df['timestamp'].dt.tz_convert('Europe/Stockholm')
        df['date'] = df['timestamp'].dt.date
        df['hour'] = df['timestamp'].dt.hour
        df['day_of_week'] = df['timestamp'].dt.dayofweek
        df['month'] = df['timestamp'].dt.month
        df['year'] = df['timestamp'].dt.year
        df['word_count'] = df['content'].astype(str).apply(lambda x: len(x.split()))
        return df

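    # Note: timestamps are converted to Europe/Stockholm above, so the
    # hour / day-of-week / date statistics used by the analyses below are in
    # that local time zone.
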
    def create_output_directories(self):
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.combined_dir, exist_ok=True)
        for user_dir in self.user_dirs.values():
            os.makedirs(user_dir, exist_ok=True)

    def analyze_message_frequency(self, df):
        daily_messages = df.groupby('date').size().reset_index(name='message_count')
        peak_day = daily_messages.loc[daily_messages['message_count'].idxmax()]
        avg_messages = daily_messages['message_count'].mean()
        weekly_messages = df.groupby(df['timestamp'].dt.to_period('W').astype(str)).size()
        monthly_messages = df.groupby(df['timestamp'].dt.to_period('M').astype(str)).size()

        return {
            'daily_messages': daily_messages,
            'peak_day': peak_day,
            'avg_messages': avg_messages,
            'weekly_messages': weekly_messages,
            'monthly_messages': monthly_messages
        }

    def visualize_message_frequency(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        sns.lineplot(x='date', y='message_count', data=data['daily_messages'])
        plt.title('Daily Message Frequency')
        plt.xlabel('Date')
        plt.ylabel('Number of Messages')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'daily_message_frequency.png'))
        plt.close()

        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))
        sns.lineplot(data=data['weekly_messages'], ax=ax1)
        ax1.set_title('Weekly Message Trend')
        ax1.set_xlabel('Week')
        ax1.set_ylabel('Number of Messages')
        ax1.tick_params(axis='x', rotation=45)

        sns.lineplot(data=data['monthly_messages'], ax=ax2)
        ax2.set_title('Monthly Message Trend')
        ax2.set_xlabel('Month')
        ax2.set_ylabel('Number of Messages')
        ax2.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'message_trends.png'))
        plt.close()

    def analyze_timing(self, df):
        hourly_activity = df.groupby('hour').size()
        # Reindex to all seven weekdays so the labels in visualize_timing always
        # line up, even for subsets with no messages on some days.
        daily_activity = df.groupby('day_of_week').size().reindex(range(7), fill_value=0)
        return {
            'hourly_activity': hourly_activity,
            'daily_activity': daily_activity,
        }

    def visualize_timing(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        sns.barplot(x=data['hourly_activity'].index, y=data['hourly_activity'].values)
        plt.title('Hourly Message Activity')
        plt.xlabel('Hour of Day')
        plt.ylabel('Number of Messages')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'hourly_activity.png'))
        plt.close()

        plt.figure(figsize=(12, 6))
        sns.barplot(x=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'], y=data['daily_activity'].values)
        plt.title('Daily Message Activity')
        plt.xlabel('Day of Week')
        plt.ylabel('Number of Messages')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'daily_activity.png'))
        plt.close()

    def analyze_content(self, df):
        avg_words = df['word_count'].mean()
        all_words = ' '.join(df['content'].astype(str).str.lower())
        word_freq = Counter(word_tokenize(all_words))
        emoji_freq = Counter(char for char in all_words if char in emoji.EMOJI_DATA)

        return {
            'avg_words': avg_words,
            'word_freq': word_freq,
            'emoji_freq': emoji_freq,
        }

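    # Note: word_freq counts raw word_tokenize() tokens, so punctuation and
    # stop words are included in the top-20 chart; emoji_freq matches single
    # characters against emoji.EMOJI_DATA, so composed emoji (ZWJ sequences,
    # skin-tone variants) may be counted as their individual components.
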
    def visualize_content(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        word_freq_df = pd.DataFrame(data['word_freq'].most_common(20), columns=['Word', 'Frequency'])
        sns.barplot(x='Frequency', y='Word', data=word_freq_df)
        plt.title('Top 20 Most Common Words')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'word_frequency.png'))
        plt.close()

        plt.figure(figsize=(12, 8))
        emoji_freq_df = pd.DataFrame(data['emoji_freq'].most_common(20), columns=['Emoji', 'Frequency'])
        bars = plt.barh(emoji_freq_df['Emoji'], emoji_freq_df['Frequency'])
        plt.title('Top 20 Most Used Emojis')
        plt.xlabel('Frequency')
        plt.ylabel('Emoji')
        # Annotate each bar with its integer count
        for bar in bars:
            width = bar.get_width()
            plt.text(width, bar.get_y() + bar.get_height() / 2, f'{int(width)}', ha='left', va='center')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'emoji_usage.png'))
        plt.close()

    def analyze_user_comparison(self):
        user_stats = {user: {
            'message_count': len(df),
            'avg_message_length': df['word_count'].mean(),
            'most_active_hour': df['hour'].value_counts().index[0],
            'most_used_words': Counter(' '.join(df['content'].astype(str)).split()).most_common(10)
        } for user, df in self.df.groupby('author')}
        return user_stats

    def visualize_user_comparison(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        sns.barplot(x=list(self.users), y=[data[user]['message_count'] for user in self.users])
        plt.title('Message Count by User')
        plt.xlabel('User')
        plt.ylabel('Number of Messages')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'user_message_count.png'))
        plt.close()

        plt.figure(figsize=(12, 6))
        sns.barplot(x=list(self.users), y=[data[user]['avg_message_length'] for user in self.users])
        plt.title('Average Message Length by User')
        plt.xlabel('User')
        plt.ylabel('Average Word Count')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'user_avg_message_length.png'))
        plt.close()

    def analyze_advanced_analytics(self, df):
        # Fill months where an author sent nothing with 0 so the stacked area
        # plot below renders without gaps.
        monthly_activity = df.groupby([df['timestamp'].dt.to_period('M').astype(str), 'author']).size().unstack().fillna(0)

        X = np.array(range(len(monthly_activity))).reshape(-1, 1)
        y = monthly_activity.sum(axis=1).values
        model = LinearRegression().fit(X, y)
        future_months = np.array(range(len(monthly_activity), len(monthly_activity) + 6)).reshape(-1, 1)
        predicted_activity = model.predict(future_months)

        return {
            'monthly_activity': monthly_activity,
            'predicted_activity': predicted_activity
        }

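    # Note: 'predicted_activity' above is a plain least-squares line fitted to
    # the monthly message totals (month index -> total messages) and
    # extrapolated six months ahead; it is a rough trend estimate, not a
    # seasonal or autoregressive forecast.
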
    def visualize_advanced_analytics(self, data, output_dir):
        plt.figure(figsize=(12, 6))
        data['monthly_activity'].plot(kind='area', stacked=True)
        plt.title('Communication Patterns Over Time')
        plt.xlabel('Month')
        plt.ylabel('Number of Messages')
        plt.legend(title='User', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'communication_patterns.png'))
        plt.close()

        plt.figure(figsize=(12, 6))
        plt.plot(range(len(data['monthly_activity'])), data['monthly_activity'].sum(axis=1), label='Actual')
        plt.plot(range(len(data['monthly_activity']), len(data['monthly_activity']) + 6), data['predicted_activity'], label='Predicted')
        plt.title('Message Activity Trend and Prediction')
        plt.xlabel('Months from Start')
        plt.ylabel('Number of Messages')
        plt.legend()
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, 'activity_prediction.png'))
        plt.close()

    def run_analysis(self):
        print("Running combined analysis...")
        self.run_analysis_for_data(self.df, self.combined_dir)

        for user in self.users:
            print(f"Running analysis for user: {user}")
            user_df = self.df[self.df['author'] == user]
            self.run_analysis_for_data(user_df, self.user_dirs[user])

    def run_analysis_for_data(self, df, output_dir):
        freq_data = self.analyze_message_frequency(df)
        self.visualize_message_frequency(freq_data, output_dir)

        timing_data = self.analyze_timing(df)
        self.visualize_timing(timing_data, output_dir)

        content_data = self.analyze_content(df)
        self.visualize_content(content_data, output_dir)

        # User comparison and advanced analytics are only produced for the
        # combined (all-authors) output.
        if output_dir == self.combined_dir:
            user_data = self.analyze_user_comparison()
            self.visualize_user_comparison(user_data, output_dir)

            advanced_data = self.analyze_advanced_analytics(df)
            self.visualize_advanced_analytics(advanced_data, output_dir)


def main():
    file_name = 'message_data.json'
    analyzer = DiscordMessageAnalyzer(file_name)
    analyzer.run_analysis()
    print(f"Analysis complete. Results saved in '{analyzer.output_dir}'.")


if __name__ == "__main__":
    main()