The majority of the sample (n=4106) is male (n=2626), and the majority of participants are in the 6-10 age range (n=2394).
import seaborn as snsimport as pximport matplotlib.pyplot as pltimport numpy as npimport umapimport as piofrom import make_datasetfrom hbn.features.build_features import get_featuresfrom hbn.visualization import visualize as vis%load_ext autoreload%autoreload 2import warningswarnings.filterwarnings('ignore')# pio.renderers.default = 'iframe'
## Functions

def piechart(dataframe, y='PreInt_DevHx,birthweight_lbs', hue='DX_01'):
    """Draw a plotly pie chart from ``dataframe``.

    Args:
        dataframe: Table holding the ``y`` column (and ``hue`` if given).
        y: Column to plot. With ``hue`` set, its per-group mean sizes the
            slices; with ``hue`` falsy, the column's values name the slices.
        hue: Column to group by before plotting. Pass a falsy value
            (``None`` or ``''``) to plot ``y`` directly without grouping.
    """
    if hue:
        # One slice per `hue` group, sized by the group mean of `y`.
        df_grouped = dataframe.groupby(hue).agg({y: 'mean'}).reset_index()
        fig = px.pie(df_grouped, names=hue, values=y)
    else:
        fig = px.pie(dataframe, names=y)

    fig.update_traces(textposition='inside', textinfo='percent+label')
    # NOTE(review): the end of this function was lost when the notebook was
    # exported; displaying the figure here is presumed -- confirm against the
    # original notebook (it may have returned `fig` instead).
    fig.show()


# NOTE(review): the `def` line of this helper was truncated in the export;
# only `race(x):` survived, so the name `race` is presumed -- confirm.
def race(x):
    """Return the display label for the integer race code ``x``.

    Raises:
        KeyError: If ``x`` is not one of the codes 0-11.
    """
    race_dict = {
        0: "White/Caucasian",
        1: "Black/African American",
        2: "Hispanic",
        3: "Asian",
        4: "Indian",
        5: "Native American Indian",
        6: "American Indian/Alaskan Native",
        7: "Native Hawaiian/Other Pacific Islander",
        8: "Two or more races",
        9: "Other race",
        10: "Unknown",
        11: "Choose not to specify",
    }
    return race_dict[x]
reading /Users/maedbhking/Documents/healthy_brain_network/data/raw/phenotype/Parent_Measures/Interview_of_Emotional_and_Psychological_Function/Intake_Interview.csv into dataframe
## summary stats
# Prints nine headline statistics about the sample. Reads `df` and `df_CGAS`,
# which are created by an earlier cell. Variable names are left as they were
# because later cells may read them from the notebook namespace.

sample_size = len(df['Identifiers'].unique())
print(f'1. total sample size is {sample_size}\n')

sex = df['Sex'].value_counts()
print(f'2. there are {sex["male"]} males and {sex["female"]} females\n')

# Ages are rounded to the nearest year before being split at 10.
ages_6_10 = len(df[df['Age'].round() <= 10])
ages_10_21 = len(df[df['Age'].round() > 10])
print(f'3. there are {ages_6_10} children ages 6-10 and {ages_10_21} children ages 11-21\n')

num_sites = len(df['Site'].unique())
print(f'4. there are {num_sites} study sites\n')

# Convert the year labels numerically. The previous `.str.strip('.0')` removed
# every leading/trailing '.' and '0' character, so '2020.0' became '202'.
# value_counts() drops NaN, so the int conversion is safe; the list keeps
# value_counts() order (most frequent enrolment year first).
years = df['Enroll_Year'].value_counts().index.astype(float).astype(int).tolist()
# Count the same non-missing years that are listed, so the number and the
# list always agree (unique() would also count NaN as a year).
num_years = len(years)
print(f'5. data were collected across {num_years} years: {years}\n')

num_disorders = len(df['DX_01'].unique())
num_cat = len(df['DX_01_Cat'].unique())
print(f'6. there are {num_disorders} unique disorders, classified under {num_cat} categories\n')

comorbid = round((df['comorbidities'].value_counts() / len(df)) * 100)
# NOTE(review): value_counts() sorts by frequency, so [1:] drops the MOST
# COMMON value of `comorbidities`, presumably the "no comorbidity" level --
# verify that this is really the most frequent one in the data.
num_comorbid = comorbid[1:].sum()
print(f'7. approximately {num_comorbid}% have disorder combordities\n')

disorder = round((df['DX_01'].value_counts() / len(df)) * 100).head(1)
print(f'8. most prevalent diagnosis is {disorder.index[0]} - {disorder.values[0]}% of sample\n')

# `sex` is rebound here from the counts above to the per-sex mean CGAS table.
sex = df_CGAS.groupby(['Sex']).agg({'CGAS_Score': 'mean'})
f_cgas = sex.loc['female'].values[0]
m_cgas = sex.loc['male'].values[0]
print(f'9. females have an average general functioning score of {round(f_cgas)}% and males {round(m_cgas)}%\n')
1. total sample size is 4106
2. there are 2626 males and 1480 females
3. there are 2394 children ages 6-10 and 1712 children ages 11-21
4. there are 5 study sites
5. data were collected across 7 years: [2018, 2019, 2017, 2016, 2020, 2015, 2021]
6. there are 71 unique disorders, classified under 17 categories
7. approximately 59.0% have disorder combordities
8. most prevalent diagnosis is ADHD-Combined Type - 18.0% of sample
9. females have an average general functioning score of 66% and males 64%
ADHD is the most frequently diagnosed disorder in the HBN sample, followed by ASD. Most disorders are diagnosed across the full 5-21 age range, except for hyperactive/impulsive ADHD, which is not diagnosed past 13-14 years of age.
Disorders with n>150: neurodevelopmental disorders (including autism, ADHD, and neurocognitive/intellectual disorders), anxiety, depression, and the control group (no diagnosis given).