The majority of the sample (n=4106) is male (n=2626), and the majority falls in the 6-10 age range (n=2394).

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import umap
import plotly.io as pio

from hbn.data import make_dataset
from hbn.features.build_features import get_features
from hbn.visualization import visualize as vis

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

# pio.renderers.default = 'iframe'

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!

## Functions

def piechart(dataframe, y='PreInt_DevHx,birthweight_lbs', hue='DX_01'):
    """Draw and display a plotly pie chart.

    Args:
        dataframe: table holding the data to plot.
        y: column name. When `hue` is falsy it is passed to `px.pie` as
            `names`; otherwise its per-group mean is passed as `values`.
        hue: optional grouping key. When truthy, `dataframe` is grouped by
            it and the groups become the pie `names`.

    Returns:
        None. The figure is shown via `fig.show()`.
    """
    if not hue:
        # Ungrouped: slices are labelled by the values of `y`.
        fig = px.pie(dataframe, names=y)
    else:
        # Grouped: one slice per `hue` group, sized by the mean of `y`.
        group_means = dataframe.groupby(hue).agg({y: 'mean'}).reset_index()
        fig = px.pie(group_means, names=hue, values=y)

    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()
    
def race(x):
    """Return the label for race code `x` (0-11).

    Raises:
        KeyError: if `x` is not one of the known codes.
    """
    # Position in this tuple is the numeric code.
    labels = (
        "White/Caucasian",
        "Black/African American",
        "Hispanic",
        "Asian",
        "Indian",
        "Native American Indian",
        "American Indian/Alaskan Native",
        "Native Hawaiian/Other Pacific Islander",
        "Two or more races",
        "Other race",
        "Unknown",
        "Choose not to specify",
    )
    # A dict lookup (rather than tuple indexing) keeps float codes such as
    # 3.0 working and raises KeyError for anything outside the table.
    return dict(enumerate(labels))[x]

# load diagnosis + demographic data + parent intake

# Summary table, and the table returned by add_CGAS_Score for it.
df = make_dataset.make_summary()
df_CGAS = make_dataset.add_CGAS_Score(df)

# Parent intake-interview features, joined to the summary on 'Identifiers'.
df_intake = get_features(
    assessment='Parent Measures',
    domains=['Interview_of_Emotional_and_Psychological_Function'],
    measures=['Intake_Interview'],
).merge(df, on='Identifiers')

# Missing race codes are filled with 10 before race() maps codes to labels.
df_intake['PreInt_Demos_Fam,Child_Race_cat'] = (
    df_intake['PreInt_Demos_Fam,Child_Race'].fillna(10).apply(race)
)

reading /Users/maedbhking/Documents/healthy_brain_network/data/raw/phenotype/Parent_Measures/Interview_of_Emotional_and_Psychological_Function/Intake_Interview.csv into dataframe

## summary stats

# Headline descriptive statistics, printed as a numbered list.
sample_size = len(df['Identifiers'].unique())
print(f'1. total sample size is {sample_size}\n')

sex = df['Sex'].value_counts()
print(f'2. there are {sex.male} males and {sex.female} females\n')

# Ages are rounded to whole years, then split at 10.
ages_6_10 = len(df[df['Age'].round()<=10])
ages_10_21 = len(df[df['Age'].round()>10])
print(f'3. there are {ages_6_10} children ages 6-10 and {ages_10_21} children ages 11-21\n')

num_sites = len(df['Site'].unique())
print(f'4. there are {num_sites} study sites\n')

# value_counts() drops missing years and orders the rest by frequency.
# The labels are converted numerically: the previous `.str.strip('.0')`
# treated '.0' as a set of characters to strip, so 2020.0 was reported as 202.
years = df['Enroll_Year'].value_counts().index.astype(float).astype(int).tolist()
# Count the same list that is printed so the two can never disagree
# (unique() would also count a missing year).
num_years = len(years)
print(f'5. data were collected across {num_years} years: {years}\n')

num_disorders = len(df['DX_01'].unique())
num_cat = len(df['DX_01_Cat'].unique())
print(f'6. there are {num_disorders} unique disorders, classified under {num_cat} categories\n')

# Percentage of rows per comorbidity count, each rounded to a whole percent.
comorbid = round((df['comorbidities'].value_counts() / len(df)) * 100)
# NOTE(review): [1:] skips the first value_counts entry, presumably the
# zero-comorbidity group -- confirm it is always the most frequent value.
num_comorbid = comorbid[1:].sum()
print(f'7. approximately {num_comorbid}% have disorder combordities\n')

# value_counts() is sorted by frequency, so head(1) is the most common DX_01.
disorder = round((df['DX_01'].value_counts() / len(df)) * 100).head(1)
print(f'8. most prevalent diagnosis is {disorder.index[0]} - {disorder.values[0]}% of sample\n')

# Mean CGAS_Score per sex; `sex` is rebound from the counts above.
sex = df_CGAS.groupby(['Sex']).agg({'CGAS_Score': 'mean'})
f_cgas = sex.loc['female'].values[0]
m_cgas = sex.loc['male'].values[0]
print(f'9. females have an average general functioning score of {round(f_cgas)}% and males {round(m_cgas)}%\n')

1. total sample size is 4106

2. there are 2626 males and 1480 females

3. there are 2394 children ages 6-10 and 1712 children ages 11-21

4. there are 5 study sites

5. data were collected across 7 years: [2018, 2019, 2017, 2016, 202, 2015, 2021]

6. there are 71 unique disorders, classified under 17 categories

7. approximately 59.0% have disorder combordities

8. most prevalent diagnosis is ADHD-Combined Type - 18.0% of sample

9. females have an average general functioning score of 66% and males 64%

# Histogram of Age, coloured by Sex (nbins=10).
fig = px.histogram(df, x="Age", nbins=10, color="Sex")
fig.show()

ADHD makes up the majority of diagnoses

# hue=None takes piechart's ungrouped branch: DX_01 values become the slice names.
piechart(dataframe=df_intake, y='DX_01', hue=None)

Majority of samples were collected in study site 1

# hue=None takes piechart's ungrouped branch: Site values become the slice names.
piechart(dataframe=df, y='Site', hue=None)

The majority of participants were enrolled between 2017 and 2019

# hue=None takes piechart's ungrouped branch: Enroll_Year values become the slice names.
piechart(dataframe=df_intake, y='Enroll_Year', hue=None)

# vis.wordcloud(dataframe=df, column='DX_01')

# Uses the race-label column derived from the numeric race codes via race().
piechart(dataframe=df_intake, y='PreInt_Demos_Fam,Child_Race_cat', hue=None)

ADHD is the most frequently diagnosed disorder in the HBN, followed by ASD. Most disorders are diagnosed across ages 5-21, except for hyperactive/impulsive ADHD, which isn't diagnosed past 13-14 years

# One marker per row: Identifiers on x, Age on y, coloured by DX_01.
fig = px.scatter(
    df,
    x="Identifiers",
    y="Age",
    color="DX_01",
    hover_name="DX_01",
    log_x=False,
    size_max=60,
)
# Hide the legend and the x tick labels; the x axis is only titled 'Participants'.
fig.update_layout(showlegend=False).update_xaxes(showticklabels=False, title='Participants')
fig.show()

# One marker per row: Identifiers on x, Age on y, coloured by DX_01_Cat.
fig = px.scatter(
    df,
    x="Identifiers",
    y="Age",
    color="DX_01_Cat",
    hover_name="DX_01_Cat",
    log_x=False,
    size_max=60,
)
# Hide the legend and the x tick labels; the x axis is only titled 'Participants'.
fig.update_layout(showlegend=False).update_xaxes(showticklabels=False, title='Participants')
fig.show()

More than twice as many females as males have major depressive disorder (including persistent depressive disorder)

# Per (DX_01, DX_01_Cat, Sex): mean age, row count and mean comorbidities.
diagnosis_grouped = (
    df.groupby(['DX_01', 'DX_01_Cat', 'Sex'])
    .agg({'Age': 'mean', 'Identifiers': 'count', 'comorbidities': 'mean'})
    .reset_index()
    .rename(columns={'Identifiers': 'count', 'Age': 'mean_age'})
)

# Horizontal bars for the 45 largest (diagnosis, sex) groups by count.
fig = px.bar(
    diagnosis_grouped.sort_values(by='count', ascending=False).head(45),
    x='count',
    y='DX_01',
    color='Sex',
    orientation='h',
)
fig.update_yaxes(title='')
fig.show()

Mood and anxiety disorders are the only disorders where females outnumber males

# Per (DX_01, DX_01_Cat, Sex): mean age, row count and mean comorbidities.
diagnosis_grouped = df.groupby(['DX_01', 'DX_01_Cat', 'Sex']).agg({
                'Age': 'mean', 'Identifiers': 'count', 'comorbidities': 'mean'
                }).reset_index()
diagnosis_grouped = diagnosis_grouped.rename({'Identifiers': 'count', 'Age': 'mean_age'}, axis=1)


# Roll the per-diagnosis rows up to category x sex. Only 'count' is additive,
# so only that column is summed; a blanket .sum() would also add up the
# per-diagnosis means and try to sum the DX_01 strings.
fig = px.bar(
    diagnosis_grouped.groupby(['DX_01_Cat', 'Sex'])['count'].sum().reset_index()
    .sort_values(by='count', ascending=False),
    x='count', y='DX_01_Cat', color='Sex', orientation='h', text='count',
)
fig.update_yaxes(title='')
fig.show()

disorders with n>150: neurodevelopmental disorders (including autism, adhd, and neurocognitive/intellectual), anxiety, depression, control group (no diagnosis given)

# Per (DX_01, DX_01_Cat, Sex): mean age, row count and mean comorbidities.
diagnosis_grouped = df.groupby(['DX_01', 'DX_01_Cat', 'Sex']).agg({
                'Age': 'mean', 'Identifiers': 'count', 'comorbidities': 'mean'
                }).reset_index()
diagnosis_grouped = diagnosis_grouped.rename({'Identifiers': 'count', 'Age': 'mean_age'}, axis=1)


# Roll the per-diagnosis rows up to one total per category. Only 'count' is
# additive, so only that column is summed; a blanket .sum() would also add up
# the per-diagnosis means and try to sum the DX_01 and Sex strings.
fig = px.bar(
    diagnosis_grouped.groupby(['DX_01_Cat'])['count'].sum().reset_index()
    .sort_values(by='count', ascending=False),
    x='count', y='DX_01_Cat', orientation='h', text='count',
)
fig.update_yaxes(title='')
fig.show()

59% of participants have a comorbidity

# Histogram of the comorbidities column, coloured by Sex (nbins=10).
fig = px.histogram(df, x="comorbidities", nbins=10, color="Sex")
fig.show()

psychosis disorders have most comorbidities (and females > males)

# Per (DX_01, DX_01_Cat, Sex) summary, keeping only groups whose mean
# comorbidities value is above zero.
diagnosis_grouped = (
    df.groupby(['DX_01', 'DX_01_Cat', 'Sex'])
    .agg({'Age': 'mean', 'Identifiers': 'count', 'comorbidities': 'mean'})
    .reset_index()
    .loc[lambda frame: frame['comorbidities'] > 0]
    .rename(columns={'Identifiers': 'count', 'Age': 'mean_age'})
)


# Side-by-side bars per sex for the 20 groups with the highest mean comorbidities.
fig = px.bar(
    diagnosis_grouped.sort_values(by='comorbidities', ascending=False).head(20),
    x='comorbidities',
    y='DX_01',
    color='Sex',
    orientation='h',
    barmode="group",
)
# The x axis is restricted to 2-6, so it does not start at zero.
fig.update_xaxes(range=[2, 6])
fig.update_yaxes(title='')
fig.show()

Majority of sample have a general functioning score between 60-70%

# Histogram of CGAS_Score from the CGAS table (nbins=10).
fig = px.histogram(df_CGAS, x="CGAS_Score",  nbins=10)
fig.show()

Across broad categories, children on the schizophrenia spectrum have the lowest general functioning

# Per DX_01_Cat: mean age, row count and mean CGAS_Score.
diagnosis_grouped = (
    df_CGAS.groupby(['DX_01_Cat'])
    .agg({'Age': 'mean', 'Identifiers': 'count', 'CGAS_Score': 'mean'})
    .reset_index()
    .rename(columns={'Identifiers': 'count', 'Age': 'mean_age'})
)


fig = px.bar(diagnosis_grouped, x='CGAS_Score', y='DX_01_Cat', orientation='h')
# The x axis is restricted to 35-80, so it does not start at zero.
fig.update_xaxes(range=[35, 80])
fig.update_yaxes(title='')
fig.show()

Most children are from the U.S.

# hue=None takes piechart's ungrouped branch: country-of-origin values become the slice names.
piechart(dataframe=df_intake, y='PreInt_Demos_Fam,Child_CountryOrigin', hue=None)


# Per (DX_01, Sex): mean age, row count and mean 'PreInt_DevHx,preg_dur'.
diagnosis_grouped = df_intake.groupby(['DX_01', 'Sex']).agg({
                'Age': 'mean', 'Identifiers': 'count', 'PreInt_DevHx,preg_dur': 'mean'
                }).reset_index()
diagnosis_grouped = diagnosis_grouped.rename({'Identifiers': 'count', 'Age': 'mean_age'}, axis=1)


# diagnosis_grouped already has one row per (DX_01, Sex), so it is plotted
# directly. Re-grouping on the same keys with .sum() only turned groups whose
# mean is missing into 0.
fig = px.bar(diagnosis_grouped.sort_values(by='count', ascending=False).head(20),
             x='PreInt_DevHx,preg_dur', y='DX_01', color='Sex', orientation='h', barmode="group")
fig.update_yaxes(title='')
# The x axis is restricted to 30-40, so it does not start at zero.
fig.update_xaxes(range=[30,40])
fig.show()

most adolescents start menstruation between 11-12 years old - girls with ADHD-hyperactive/impulsive type start earliest at 10 years of age

# Per DX_01: mean age, row count and mean 'PreInt_DevHx,menstruation_age'.
diagnosis_grouped = df_intake.groupby(['DX_01']).agg({
                'Age': 'mean', 'Identifiers': 'count', 'PreInt_DevHx,menstruation_age': 'mean'
                }).reset_index()
diagnosis_grouped = diagnosis_grouped.rename({'Identifiers': 'count', 'Age': 'mean_age'}, axis=1)


# diagnosis_grouped already has one row per DX_01, so it is plotted directly.
# Re-grouping on the same key with .sum() only turned diagnoses whose mean is
# missing into 0.
fig = px.bar(diagnosis_grouped.sort_values(by='count', ascending=False).head(20),
             x='PreInt_DevHx,menstruation_age', y='DX_01',  orientation='h', barmode="group")
fig.update_yaxes(title='')
# The x axis is restricted to 8-13, so it does not start at zero.
fig.update_xaxes(range=[8,13])
fig.show()

On average, children had a normal birthweight

# Per DX_01: mean age, row count and mean 'PreInt_DevHx,birthweight_ozs'.
# NOTE(review): piechart defaults to 'PreInt_DevHx,birthweight_lbs'; if _ozs
# holds only the ounces remainder of a lbs+oz weight, this average is not the
# birthweight -- confirm which column is intended.
diagnosis_grouped = df_intake.groupby(['DX_01']).agg({
                'Age': 'mean', 'Identifiers': 'count', 'PreInt_DevHx,birthweight_ozs': 'mean'
                }).reset_index()
diagnosis_grouped = diagnosis_grouped.rename({'Identifiers': 'count', 'Age': 'mean_age'}, axis=1)


# diagnosis_grouped already has one row per DX_01, so it is plotted directly.
# Re-grouping on the same key with .sum() only turned diagnoses whose mean is
# missing into 0.
fig = px.bar(diagnosis_grouped.sort_values(by='count', ascending=False).head(20),
             x='PreInt_DevHx,birthweight_ozs', y='DX_01',  orientation='h', barmode="group")
fig.update_yaxes(title='')
# The x axis is restricted to 4-8, so it does not start at zero.
fig.update_xaxes(range=[4,8])
fig.show()

# Box plot of 'PreInt_DevHx,puberty_age' with one box per DX_01 value.
fig = px.box(
    df_intake,
    x="DX_01",
    y="PreInt_DevHx,puberty_age",
    hover_name="DX_01",
)
# Hide the legend and the x tick labels; the x axis is only titled 'Disorders'.
fig.update_layout(showlegend=False).update_xaxes(showticklabels=False, title='Disorders')
fig.show()


# Per DX_01: number of rows whose premature flag equals 1 ...
premature = (
    df_intake.loc[df_intake["PreInt_DevHx,premature"] == 1]
    .groupby('DX_01')[["PreInt_DevHx,premature"]]
    .count()
)
# ... and number of rows with any non-missing premature value.
total = df_intake.groupby('DX_01')[["PreInt_DevHx,premature"]].count()

# Despite the name this is a 0-1 fraction. Diagnoses absent from `premature`
# come out as NaN because the division aligns on the DX_01 index.
percent = premature.div(total)

# DX_01 is the index of `percent`; piechart groups on it by name.
piechart(percent, y="PreInt_DevHx,premature", hue="DX_01")