# Outlier pairplot using Spark and Seaborn

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, lit, percentile_approx
from sklearn.datasets import load_iris

# Initialize Spark session
spark = SparkSession.builder.appName("IrisOutliers").getOrCreate()

# Load the Iris dataset into a Spark DataFrame.
# Iris is only a small stand-in for demonstration purposes;
# replace this with your large dataset.
iris = load_iris()
data = iris.data
columns = iris.feature_names

# Convert to Spark DataFrame (through pandas so the feature names become
# the column names)
df = spark.createDataFrame(pd.DataFrame(data, columns=columns))

# Calculate the approximate 1st and 99th percentiles for every column in a
# single aggregation; each aliased result holds [1st, 99th] for its column.
percentiles = df.agg(
    *[percentile_approx(col(c), [0.01, 0.99]).alias(c) for c in columns]
).collect()[0]

# Create lower and upper bounds dictionaries keyed by column name
lower_bounds = {c: percentiles[c][0] for c in columns}
upper_bounds = {c: percentiles[c][1] for c in columns}

# Seed 'category' with string-typed nulls. The loop below falls back to
# col('category') in its .otherwise() branch, which cannot be resolved on
# the first iteration unless the column already exists.
df = df.withColumn('category', lit(None).cast('string'))

# Categorize each row column by column. A row keeps the label it already
# has unless the current column flags it, so when several columns flag the
# same row the label from the last such column wins.
for c in columns:
    df = df.withColumn(
        'category',
        when(col(c) < lower_bounds[c], 'lower_outlier')
        .when(col(c) > upper_bounds[c], 'higher_outlier')
        .otherwise(col('category'))
    )

# Rows never flagged by any column are still null: label them 'normal'
df = df.fillna('normal', subset=['category'])

# Convert to Pandas DataFrame for visualization
pandas_df = df.toPandas()


import seaborn as sns
import matplotlib.pyplot as plt

# Build a pair grid over the columns of pandas_df, coloured by 'category'.
# diag_sharey=False is passed so the diagonal panels do not share a y-axis.
g = sns.PairGrid(data=pandas_df, hue="category", diag_sharey=False)

# The mapping methods return the grid, so the layers are chained here.
# Drawing order: scatter on the upper triangle, KDE on the lower triangle,
# KDE on the diagonal, scatter on every off-diagonal panel, then the legend.
(
    g.map_upper(sns.scatterplot)
    .map_lower(sns.kdeplot)
    .map_diag(sns.kdeplot)
    .map_offdiag(sns.scatterplot)
    .add_legend()
)

plt.show()
##################################################################################################################
## Another version using pandas
##################################################################################################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

# Put the Iris measurements in a DataFrame, one column per feature name
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Per-column 5th and 95th percentiles: row 0 holds the lower bounds,
# row 1 the upper bounds
percentiles = df.quantile(q=[0.05, 0.95])

def categorize_value(val, lower_bound, upper_bound):
    """Classify a single value against an inclusive [lower_bound, upper_bound] band.

    Returns 'outlier' when val is strictly below lower_bound or strictly
    above upper_bound, and 'normal' otherwise (values equal to a bound are
    'normal').
    """
    if val < lower_bound:
        return 'outlier'
    if val > upper_bound:
        return 'outlier'
    return 'normal'

def categorize_row(row):
    """Label a row 'outlier' if any feature falls outside its percentile band.

    Intended for ``DataFrame.apply(..., axis=1)``. Bounds come from the
    module-level ``percentiles`` frame: for each of its columns, row 0 is
    used as the lower bound and row 1 as the upper bound.

    Returns 'outlier' as soon as one feature is out of range, otherwise
    'normal'.
    """
    # Iterate the columns that actually have bounds rather than df.columns:
    # once 'category' has been added to df it has no entry in `percentiles`,
    # and re-applying this function would fail with a KeyError.
    for feature in percentiles.columns:
        bounds = percentiles[feature]
        lower_bound = bounds.iloc[0]
        upper_bound = bounds.iloc[1]
        if categorize_value(row[feature], lower_bound, upper_bound) == 'outlier':
            return 'outlier'
    return 'normal'

# Label every row 'outlier' or 'normal' (row-wise apply of categorize_row)
df['category'] = df.apply(categorize_row, axis=1)

# Initialize the PairGrid with the 'category' column as hue
g = sns.PairGrid(df, hue="category", diag_sharey=False)
g.map_upper(sns.scatterplot)
# Alternative for the diagonal: g.map_diag(sns.histplot, color=".1")
g.map_lower(sns.kdeplot, color=".3")
g.map_diag(sns.kdeplot)
# Scatter is drawn on every off-diagonal panel, so the lower triangle ends
# up with the points on top of the KDE contours
g.map_offdiag(sns.scatterplot)

# Add a legend
g.add_legend()

# Display this figure here, as the other sections do, instead of leaving
# it open until a later section happens to call plt.show()
plt.show()

#################################################
### with Many variables - create a pair plot where the target variable is based on outliers (values greater than the 99th percentile and less than the 1st percentile)
#####################################################

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Demonstration data: 1000 rows x 30 columns of seeded uniform random numbers.
# Swap this for your own dataset.
np.random.seed(42)
data = np.random.rand(1000, 30)
columns = (
    ['NoOd', 'NoE7', 'Pop', 'Wok', 'HH', 'OwnO', 'OwnM', 'SocR', 'PriR']
    + [f'FV{i}' for i in range(12)]
    + ['DmA']
    + [f'CT{letter}' for letter in 'ABCDEFGH']
)
df = pd.DataFrame(data, columns=columns)

# Per-column cut-offs at the 1st and 99th percentiles
lower_bound = df.quantile(0.01)
upper_bound = df.quantile(0.99)

# A row counts as an outlier when at least one of its values lies outside
# its column's cut-offs; store the flag as a readable string label
df['outlier'] = (
    ((df < lower_bound) | (df > upper_bound))
    .any(axis=1)
    .map({True: 'outlier', False: 'normal'})
)

# Create the pair plot: one panel per pair of columns in df, coloured by the
# 'outlier' label, with one marker per label ("o" and "s").
# NOTE(review): two markers are supplied, so this presumably expects both
# labels ('outlier' and 'normal') to be present in the data - confirm when
# swapping in a real dataset.
sns.set(style="ticks")
pair_plot = sns.pairplot(df, hue="outlier", markers=["o", "s"])

# Show the plot (a single call is enough; the second one was redundant)
plt.show()
############################################
# Heatmap
############################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Column labels for the example frame
columns = [
    'NoOd', 'NoE7', 'Pop', 'Wok', 'HH', 'OwnO', 'OwnM', 'SocR', 'PriR',
    'FV0', 'FV1', 'FV2', 'FV3', 'FV4', 'FV5', 'FV6',
    'FV7', 'FV8', 'FV9', 'FV10', 'FV11',
    'DmA',
    'CTA', 'CTB', 'CTC', 'CTD', 'CTE', 'CTF', 'CTG', 'CTH',
]

# Example data: 1000 rows of seeded uniform random numbers, one column per
# label. Replace this with your actual data.
np.random.seed(0)
data = np.random.rand(1000, len(columns))
df = pd.DataFrame(data=data, columns=columns)

# Pairwise correlation between all columns
corr = df.corr()

# Open a 12x10 figure for the heatmap to draw into
plt.figure(figsize=(12, 10))

# Full (unmasked) correlation heatmap: square cells, colour scale fixed to
# [-1, 1] and centred on 0, no per-cell annotations
sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

# Vertical x labels and horizontal y labels, then tighten the layout
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.tight_layout()

# Show the plot
plt.show()

###############################
## Heatmap - only lower triangle
##############################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Sample DataFrame: seeded uniform random numbers, 1000 rows, with as many
# columns as there are labels (30)
np.random.seed(0)
columns = (
    'NoOd NoE7 Pop Wok HH OwnO OwnM SocR PriR '
    'FV0 FV1 FV2 FV3 FV4 FV5 FV6 FV7 FV8 FV9 FV10 FV11 '
    'DmA CTA CTB CTC CTD CTE CTF CTG CTH'
).split()
data = np.random.rand(1000, len(columns))
df = pd.DataFrame(data, columns=columns)

# Pairwise correlation between all columns
corr = df.corr()

# Open a 12x10 figure for the heatmap to draw into
plt.figure(figsize=(12, 10))

# Boolean mask that is True on and above the diagonal; passed as `mask`
# so that only the cells below the diagonal are drawn
mask = np.triu(np.ones_like(corr, dtype=bool))
heatmap = sns.heatmap(
    corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

# Explicitly place one tick per column at 0.5, 1.5, ... and label each with
# its column name (x labels vertical, y labels horizontal)
tick_positions = np.arange(len(columns)) + 0.5
heatmap.set_xticks(tick_positions)
heatmap.set_xticklabels(columns, rotation=90)
heatmap.set_yticks(tick_positions)
heatmap.set_yticklabels(columns, rotation=0)

plt.tight_layout()
plt.show()

############################
## One more correlogram
############################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Sample data (replace with your actual data): for each variable name, in
# order, draw 100 seeded uniform random numbers
np.random.seed(0)
data = pd.DataFrame({
    name: np.random.rand(100)
    for name in (
        'SCST', 'female', 'mmedia', 'poor', 'prop4pl', 'lat',
        'never_breastfeed', 'low_bmi', 'hindu', 'mothage_20',
        'low_birthintval', 'watersup', 'hh_fem', 'edu', 'ptoilet',
        'urban', 'pcfuel', 'vacc',
    )
})

# Pairwise correlation between all variables
corr = data.corr()

# Boolean mask that is True on and above the diagonal, so the heatmap only
# draws the cells below it
mask = np.triu(np.ones_like(corr, dtype=bool))

# Create the 11x9 figure and its axes
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap built from hues 240 and 10
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Annotated heatmap on the axes just created: square cells, colour scale
# fixed to [-1, 1] and centred on 0, one-decimal labels in each cell
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    fmt=".1f",
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    ax=ax,
)

# Title the plot and display it
plt.title('Correlogram')
plt.show()

###################################
## Another one
##################################
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Sample data (replace this with your actual data): every variable gets its
# own draw of 100 seeded uniform random numbers, in the order listed
np.random.seed(0)
data = pd.DataFrame(dict(
    SCST=np.random.rand(100),
    female=np.random.rand(100),
    mmedia=np.random.rand(100),
    poor=np.random.rand(100),
    prop4pl=np.random.rand(100),
    lat=np.random.rand(100),
    never_breastfeed=np.random.rand(100),
    low_bmi=np.random.rand(100),
    hindu=np.random.rand(100),
    mothage_20=np.random.rand(100),
    low_birthintval=np.random.rand(100),
    watersup=np.random.rand(100),
    hh_fem=np.random.rand(100),
    edu=np.random.rand(100),
    ptoilet=np.random.rand(100),
    urban=np.random.rand(100),
    pcfuel=np.random.rand(100),
    vacc=np.random.rand(100),
))

# Pairwise correlation between all variables
corr = data.corr()

# Boolean mask that is True on and below the diagonal, so the heatmap only
# draws the cells above it
mask = np.tril(np.ones_like(corr, dtype=bool))

# Create the 12x10 figure and its axes
f, ax = plt.subplots(figsize=(12, 10))

# Custom diverging colormap built from hues 240 and 10
cmap = sns.diverging_palette(240, 10, as_cmap=True)

# Annotated heatmap: square cells, colour scale fixed to [-1, 1] and centred
# on 0, one-decimal labels in each cell at font size 8
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmin=-1,
    vmax=1,
    center=0,
    annot=True,
    fmt=".1f",
    annot_kws={"size": 8},
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

# Vertical x tick labels and horizontal y tick labels for readability
plt.xticks(rotation=90)
plt.yticks(rotation=0)

# Title the plot and display it
plt.title('Correlogram', fontsize=15)
plt.show()