Topic Modeling with BERT

%matplotlib inline 

# standard libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import plotly.offline as pyo
pyo.init_notebook_mode()

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

Leveraging BERT and TF-IDF to create easily interpretable topics.

See https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6 for a great tutorial (code in this notebook comes from the tutorial)

from hbn.models.topic_modeling import *

# load data
# dataframe contains sentences for all HBN clinical questionnaires

#data_dir = '/Users/maedbhking/Documents/healthy_brain_network/data/raw/phenotype'
data_dir = '/Users/maedbhking/Documents/hbn_data'
fname = 'item-names-cleaned.csv'

df = pd.read_csv(os.path.join(data_dir, fname))

# get sentences to be input to topic modeling routine
data = df['questions'].dropna().tolist()
idx = df['questions'].isna()
df_dict = df[~idx].reset_index(drop=True)

# save directory
save_dir = '/Users/maedbhking/Downloads/'

Quick fixes to domain names in the data dictionary

  • one of the domain names for Parent Measures is missing in https://docs.google.com/spreadsheets/d/1sGb3ECGR47BzIWNZwzh4ARrjFaf5ByVA/edit#gid=1593310919
  • therefore, we assign all measures belonging to this missing domain to the Symptoms domain
## quick fix to missing Parent domain names ##
measures_to_change = ['SympChck', 'ICU_P', 'ARI_P', 'SRS_Pre', 'SRS', 'RBS', 'SDQ', 'WHODAS_P', 'SAS', 
                'CIS_P', 'SCQ', 'ASSQ', 'SWAN','ESWAN','SCARED_P','MFQ_P', 'CBCL', 'CBCL_Pre']
for abbrev in df_dict['datadic'].unique():
    if abbrev in measures_to_change:
        df_dict.loc[df_dict["datadic"]==abbrev, "domains"] = 'Symptoms'
## quick fix to make domain names more palatable for visualization ##

remap = {'Questionnaire_Measures_of_Family_Structure_Stress_and_Trauma': 'Trauma',
        'Demographic_Questionnaire_Measures': 'Demographics',
        'Questionnaire_Measures_of_Emotional_and_Cognitive_Status': 'Emotional Status',
        'Physical_Fitness_and_Status': 'Fitness',
        'Language_Tasks': 'Language',
        'Vision': 'Vision',
        'Medical_Status_Measures': 'Medical History',
        'Neurologic_Function': 'MRI/EEG',
        'Questionnaire_Measures_of_Substance_Use_&_Addiction': 'Substance Use',
        'Cognitive_Testing': 'Cognition',
        'Interview_of_Emotional_and_Psychological_Function': 'Psychological Function',
        'Motor_Skills': 'Motor Skills',
        'Physiologic_Function': 'Physiology',
         'Questionnaire_Measures_of_Family_structure_Stress_and_Trauma': 'Trauma',
         'Symptoms': 'Symptoms'
        }

for domain in df_dict['domains'].unique():
    if domain in remap.keys():
        df_dict.loc[df_dict['domains']==domain, 'domains'] = remap[domain]

Embeddings

The first step is to convert the documents to numerical data. We use BERT for this purpose because it extracts embeddings that depend on the context of each word, and because many pre-trained models are readily available.

How you generate the BERT embeddings for a document is up to you. However, I prefer to use the sentence-transformers package, as the resulting embeddings are of high quality and typically work well for document-level embeddings.

The tutorial uses DistilBERT for its balance between speed and performance; here we use the smaller all-MiniLM-L6-v2 model (see the cell below). The sentence-transformers package also provides several multilingual models.

NOTE: Since transformer models have a token limit, you might run into some errors when inputting large documents. In that case, you could consider splitting documents into paragraphs.

# get embeddings
embeddings = get_embeddings(data, transformer='all-MiniLM-L6-v2') # 'distilbert-base-nli-mean-tokens'
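
get_embeddings is a helper from hbn.models.topic_modeling; as a rough sketch (an assumption, not the actual implementation), it presumably just wraps SentenceTransformer.encode from the sentence-transformers package:

from sentence_transformers import SentenceTransformer

def get_embeddings(sentences, transformer='all-MiniLM-L6-v2'):
    # encode each sentence into a fixed-length vector (384 dimensions for all-MiniLM-L6-v2)
    model = SentenceTransformer(transformer)
    return model.encode(sentences, show_progress_bar=True)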

Clustering

We want to make sure that documents with similar topics are clustered together such that we can find the topics within these clusters. Before doing so, we first need to lower the dimensionality of the embeddings as many clustering algorithms handle high dimensionality poorly.

UMAP

Out of the commonly used dimensionality reduction algorithms, UMAP is arguably the best performing, as it preserves a significant portion of the high-dimensional local structure in the lower-dimensional space.

We use the umap-learn package to lower the dimensionality of the document embeddings. Here we reduce the dimensionality to 5 while keeping the size of the local neighborhood at 15. You can play around with these values to optimize your topic creation; too low a dimensionality results in a loss of information, while too high a dimensionality gives poorer clustering results.

# get umap embeddings
umap_embeddings = dimensionality_reduction(embeddings)
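
dimensionality_reduction is another hbn helper; a minimal sketch of the UMAP call it presumably makes, using the parameter values described above (the metric and exact defaults are assumptions):

import umap

def dimensionality_reduction(embeddings, n_neighbors=15, n_components=5):
    # reduce the sentence embeddings to a low-dimensional space that clustering can handle
    return umap.UMAP(n_neighbors=n_neighbors,
                     n_components=n_components,
                     metric='cosine').fit_transform(embeddings)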

HDBSCAN

Having reduced the document embeddings to 5 dimensions, we can cluster the documents with the hdbscan package. HDBSCAN is a density-based algorithm that works well with UMAP, since UMAP preserves a lot of local structure even in the lower-dimensional space. Moreover, HDBSCAN does not force every data point into a cluster; points that do not fit well are treated as outliers.

# get clusters
cluster = clustering(umap_embeddings, min_cluster_size=30)
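
Likewise, a sketch of what the clustering helper presumably does with hdbscan (min_cluster_size matches the call above; the other settings follow the tutorial and are assumptions here):

import hdbscan

def clustering(umap_embeddings, min_cluster_size=30):
    # density-based clustering; points that do not fit any cluster receive the label -1
    return hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                           metric='euclidean',
                           cluster_selection_method='eom').fit(umap_embeddings)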

Great! We have now clustered similar documents together, and these clusters should correspond to topics. To visualize the resulting clusters we further reduce the dimensionality to 2 and plot the outliers as grey points:

plotting_style()

# visualize clusters
visualize_clusters(embeddings, cluster, n_neighbors=18)

It is difficult to visualize the individual clusters due to the number of topics generated. However, we can see that even in 2-dimensional space some local structure is kept.
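
For reference, a sketch of the plotting logic the visualize_clusters helper presumably implements, following the tutorial: project the embeddings to 2D with UMAP, then plot outliers in grey and clustered points coloured by label.

import umap
import pandas as pd
import matplotlib.pyplot as plt

def visualize_clusters_sketch(embeddings, cluster, n_neighbors=18):
    # 2D projection purely for visualization
    umap_2d = umap.UMAP(n_neighbors=n_neighbors, n_components=2,
                        min_dist=0.0, metric='cosine').fit_transform(embeddings)
    result = pd.DataFrame(umap_2d, columns=['x', 'y'])
    result['labels'] = cluster.labels_

    # outliers (label -1) in grey, clustered points coloured by cluster label
    outliers = result[result.labels == -1]
    clustered = result[result.labels != -1]
    plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.5)
    plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.5, cmap='hsv_r')
    plt.colorbar()
    plt.show()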

NOTE: You could skip the dimensionality reduction step if you use a clustering algorithm that can handle high dimensionality like a cosine-based k-Means.

Topic Creation

What we want to know from the clusters we generated is: what makes one cluster different from another, based on its content?

How can we derive topics from clustered documents?

To solve this, I came up with a class-based variant of TF-IDF (c-TF-IDF) that allows me to extract what makes each set of documents unique compared to the others.

The intuition behind the method is as follows. When you apply TF-IDF as usual on a set of documents, what you are basically doing is comparing the importance of words between documents.

What if we instead treat all documents in a single category (e.g., a cluster) as a single document and then apply TF-IDF? The result would be one very long document per category, and the resulting TF-IDF scores would highlight the important words in a topic.

c-TF-IDF

To create this class-based TF-IDF score, we need to first create a single document for each cluster of documents:

# get docs_per_topic
docs_per_topic, docs_df = tf_idf(data, cluster)
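
In the tutorial, this step simply pairs every sentence with its cluster label and joins the sentences of each cluster into one long document; the tf_idf helper presumably does something like:

# pair each sentence with its HDBSCAN cluster label
docs_df = pd.DataFrame({'Doc': data, 'Topic': cluster.labels_, 'Doc_ID': range(len(data))})

# join all sentences of a cluster into a single "document" per topic
docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})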

Then, we apply the class-based TF-IDF to these joined documents: the frequency of each word t is extracted for each class i and divided by the total number of words w in that class (a form of regularization of frequent words within the class), and this is multiplied by the log of the total, unjoined, number of documents m divided by the total frequency of word t across all n classes. In other words, c-TF-IDF for word t in class i = (t_i / w_i) × log(m / Σ_j t_j).

# get idf scores
tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))
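
Following the formula above, the c_tf_idf function mirrors the tutorial's implementation; a sketch:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def c_tf_idf(documents, m, ngram_range=(1, 1)):
    # term counts per class (one joined document per topic)
    count = CountVectorizer(ngram_range=ngram_range, stop_words="english").fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)                                 # total number of words per class
    tf = np.divide(t.T, w)                            # class-normalized term frequency
    sum_t = t.sum(axis=0)                             # frequency of each word across all classes
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)  # inverse class frequency
    return np.multiply(tf, idf), count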

Now we have a single importance value for each word in a cluster, which can be used to create the topic. If we take the most important words in each cluster, we get a good representation of that cluster, and thereby a topic.

To create the topic representation, we take the top 20 words per topic based on their c-TF-IDF scores. The higher the score, the more representative a word should be of its topic, since the score is a proxy for information density.

# create topic representation
top_n_words = extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20)
topic_sizes = extract_topic_sizes(docs_df); 
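
Sketches of the two helpers used here, again mirroring the tutorial: pick the n highest-scoring words per topic from the c-TF-IDF matrix, and count how many documents fall into each topic.

def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic, n=20):
    words = count.get_feature_names_out()          # vocabulary (older sklearn: get_feature_names)
    labels = list(docs_per_topic.Topic)
    tf_idf_transposed = tf_idf.T                   # shape: (n_topics, n_words)
    indices = tf_idf_transposed.argsort()[:, -n:]  # indices of the n largest scores per topic
    return {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1]
            for i, label in enumerate(labels)}

def extract_topic_sizes(df):
    # number of documents assigned to each topic, largest first
    return (df.groupby(['Topic'])
              .Doc.count()
              .reset_index()
              .rename(columns={'Doc': 'Size'})
              .sort_values('Size', ascending=False))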

We can use topic_sizes to view how frequent certain topics are:

print(topic_sizes.head(10))
    Topic  Size
0      -1  1520
27     26  1253
31     30   468
8       7   370
7       6   337
2       1   291
20     19   205
24     23   124
23     22   115
13     12   113

The topic -1 refers to all documents that did not have any topic assigned. The great thing about HDBSCAN is that not all documents are forced into a cluster; if no cluster could be found, the document is simply an outlier.

We can see that topics 26, 30, and 7 are the largest clusters that were created. To view the words belonging to those topics, we can simply use the dictionary top_n_words to access them:

# print(top_n_words[31][:10])

# print(top_n_words[27][:10])

# print(top_n_words[7][:10])

# print(top_n_words[3][:10])

Looking at the largest four topics, these seem to represent easily interpretable topics!

Visualization

Put everything together and visualize the clusters + their corresponding topic

# create topic data to figure out which topics are assigned to each sentence 
docs_topic = docs_df.merge(topic_sizes, on=['Topic'])
topic_data = pd.concat([df_dict, docs_topic], axis=1)

df_all = visualize_clusters_interactive(embeddings=embeddings, 
                                        topic_data=topic_data,
                                        hover_data=['measures', 'questions', 'domains', 'Topic'], # 'Topic', 'datadic',
                                        cluster=cluster, 
                                        n_neighbors=20,
                                        min_dist=0.8,
                                        n_components=2
                                       )
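
visualize_clusters_interactive is an hbn helper; a rough sketch of the idea (an assumption, not the actual implementation): project the embeddings to 2D with UMAP and pass the hover columns from topic_data to a plotly express scatter.

import umap
import plotly.express as px

def visualize_clusters_interactive_sketch(embeddings, topic_data, hover_data,
                                          n_neighbors=20, min_dist=0.8, n_components=2):
    # 2D projection for the interactive scatter plot
    umap_2d = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                        n_components=n_components, metric='cosine').fit_transform(embeddings)
    df_plot = topic_data.copy()
    df_plot['x'], df_plot['y'] = umap_2d[:, 0], umap_2d[:, 1]

    # colour by topic, show the requested columns on hover
    px.scatter(df_plot, x='x', y='y', color='Topic', hover_data=hover_data).show()
    return df_plot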

How are measures assigned to topics/clusters?

measures = df_all['datadic'].unique()

df_concat = pd.DataFrame()
for metric in ['labels', 'Topic']:
    count_all = []
    for measure in measures:
        count = df_all[(df_all['datadic']==measure) & (df_all['labels']!=-1) & (df_all['Topic']!=-1)][metric].unique()
        count_all.append(len(count))

    df = pd.DataFrame({'count': np.array(count_all),
                        'measures': measures,
                        'metric': np.tile(metric, len(count_all))})

    df_concat = pd.concat([df_concat, df])

metric='labels'
print(f'assigning measures to {metric} \n')
print('the following measures are considered outliers and are not assigned to clusters: \n')
print(df_concat[df_concat['count']==0]['measures'].unique())

print('\n')
      
print('the following measures are assigned to many clusters (most - least): \n')
tmp = df_concat[
    (df_concat['metric']==metric) & (df_concat['count']!=0)].sort_values(
    by='count', ascending=False)['measures'].head(10).tolist()
print(tmp)
assigning measures to labels 

the following measures are considered outliers and are not assigned to clusters: 

['ACE' 'ACE_P' 'APQ_P' 'APQ_SR' 'ARI_P' 'ARI_S' 'ASR' 'ASSQ' 'AUDIT'
 'Barratt' 'BasicDemo' 'BIA' 'C3SR' 'CAARS' 'CBCL' 'CBCL_Pre' 'CCSC'
 'CELF' 'CELF_Full_9to21' 'CELF_Full_5to8' 'CELF_Meta' 'CFS' 'CGAS'
 'CIS_P' 'CIS_SR' 'ColorVision' 'ConsensusD' 'CPIC' 'CSSRS' 'CTOPP'
 'DailyMed']


the following measures are assigned to many clusters (most - least): 

['PreInt_DevH', 'PreInt_FamH', 'Diagnosis_KSADS', 'YSR', 'TRF', 'TRF_P', 'SDQ', 'PBQ', 'PreInt_FamHx_RDC', 'PMHS']

Topic Reduction

Depending on the dataset, there is a chance you will end up with hundreds of topics! You can tweak HDBSCAN's min_cluster_size parameter to obtain fewer topics, but it does not allow you to specify the exact number of clusters.

A nifty trick used by Top2Vec is to reduce the number of topics by merging the topic vectors that are most similar to each other.

We can use a similar technique: compare the c-TF-IDF vectors across topics, merge the most similar ones, and finally re-calculate the c-TF-IDF vectors to update the representation of our topics:

# topic reduction

docs_df_adj, top_n_words_adj = topic_reduction(data, docs_df, tf_idf)
topic_sizes_adj = extract_topic_sizes(docs_df_adj);
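
A sketch of a single reduction step, following the tutorial: find the smallest topic, merge it into the topic whose c-TF-IDF vector is most similar, re-map the labels, and recompute c-TF-IDF (topic_reduction presumably loops over this):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def merge_smallest_topic(data, docs_df, tf_idf_matrix):
    # similarity between topic c-TF-IDF vectors (one column per topic, topic -1 first)
    similarities = cosine_similarity(tf_idf_matrix.T)
    np.fill_diagonal(similarities, 0)

    # merge the smallest topic into its most similar neighbour
    sizes = docs_df.groupby(['Topic']).count().sort_values('Doc', ascending=False).reset_index()
    topic_to_merge = sizes.iloc[-1].Topic
    topic_to_merge_into = np.argmax(similarities[topic_to_merge + 1]) - 1
    docs_df.loc[docs_df.Topic == topic_to_merge, 'Topic'] = topic_to_merge_into

    # re-map topic labels so they stay consecutive, then recompute c-TF-IDF
    old_topics = docs_df.sort_values('Topic').Topic.unique()
    docs_df.Topic = docs_df.Topic.map({old: i - 1 for i, old in enumerate(old_topics)})
    docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})
    tf_idf_matrix, count = c_tf_idf(docs_per_topic.Doc.values, m=len(data))
    return docs_df, tf_idf_matrix, count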

Above, we took the least common topic and merged it with the most similar topic. By repeating this 19 more times we reduced the number of topics from 56 to 36!

NOTE: We can skip the re-calculation part of this pipeline to speed up the topic reduction step. However, it is more accurate to re-calculate the c-TF-IDF vectors, as that better represents the newly generated content of the topics. You can play around with this by, for example, updating only every n steps to both speed up the process and still keep good topic representations.

TIP: You can use the method described in this article (or simply use BERTopic) to also create sentence-level embeddings. The main advantage of this is the possibility to view the distribution of topics within a single document.

# print(top_n_words_adj[31][:10])

# print(top_n_words_adj[2][:10])

# print(top_n_words_adj[29][:10])

# print(top_n_words_adj[31][:10])

Topic Modeling with BERTopic

Code taken from this tutorial: https://python.plainenglish.io/topic-modeling-for-beginners-using-bertopic-and-python-aaf1b421afeb. Github here: https://github.com/MaartenGr/BERTopic

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,    # Step 1 - Extract embeddings
  umap_model=umap_model,              # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
#   n_gram_range=(1, 2),
  nr_topics=30,
  min_topic_size=15,                   
  calculate_probabilities=True,        
  verbose=True
)

topics, probs = topic_model.fit_transform(data)
topic_model.get_topic_info()
2023-05-13 08:09:55,134 - BERTopic - Transformed documents to Embeddings
2023-05-13 08:09:57,705 - BERTopic - Reduced dimensionality
2023-05-13 08:09:58,621 - BERTopic - Clustered reduced embeddings
2023-05-13 08:10:00,407 - BERTopic - Reduced number of topics from 88 to 30
    Topic  Count  Name
0      -1   1659  -1_use_past_things_reason
1       0   1560  0_child_score_use_problems
2       1    550  1_30_days_age_things
3       2    277  2_disorder_anxiety_diagnosis_psychiatric
4       3    200  3_weekend_weekdays_parent_monitor
5       4    196  4_heart_stage_dominant_participant
6       5    188  5_history_cause_medical_disease
7       6    154  6_rank_percentile_comprehension_index
8       7    128  7_driving_intoxicated_smoke_mother
9       8    122  8_eating_eat_food_foods
10      9    116  9_used_medication_planned_occasions
11     10    110  10_age_grandmother_death_current
12     11    100  11_plate_picture_span_sort
13     12     87  12_times_recovering_obtaining_great
14     13     85  13_duration_weeks_longest_sibling
15     14     82  14_sleep_night_asleep_sleeps
16     15     75  15_total_score_composite_pri
17     16     51  16_parents_argue_arguments_usually
18     17     45  17_craving_strong_desire_obsessions
19     18     34  18_student_peers_school_class
20     19     33  19_tolerance_substance_marked_concentration
21     20     33  20_restless_seated_remaining_difficulty
22     21     31  21_weight_mass_gain_birth
23     22     29  22_frightened_afraid_like_feels
24     23     28  23_pcp_ketamine_dust_angel
25     24     27  24_withdrawal_substance_symptoms_present
26     25     25  25_texting_messaging_weekdays_weekend
27     26     24  26_hyperactivity_deficit_autism_attention
28     27     23  27_dose_highest_medication_
29     28     21  28_glue_aerosols_inhalants_high
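
To inspect the words and c-TF-IDF scores behind any one of these topics, we can use get_topic (topic 2, the anxiety/diagnosis topic above, is just an example):

# top words and their c-TF-IDF scores for a single topic
topic_model.get_topic(2)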

Visualize Topics

 
topic_model.visualize_topics()

Visualizing word frequency

topic_model.visualize_barchart(top_n_topics=10)

Visualize topic similarity

# topic_model.update_topics(data, n_gram_range=(1, 2))
# topic_model.reduce_topics(data, nr_topics=3)
topic_model.visualize_heatmap(n_clusters=5, width=1000, height=1000)

Find topics

topic_model.find_topics("anxiety")
([2, 22, 17, 5, 26],
 [0.7407424308803141,
  0.5320427976392154,
  0.5257778782422755,
  0.5054033168488012,
  0.4752042501825976])

Visualize hierarchy

topic_model.visualize_hierarchy()

Check out this other tutorial using Top2Vec

https://github.com/ddangelov/Top2Vec

Sentence similarity across measures

Code taken from here: https://www.sbert.net/docs/usage/semantic_textual_similarity.html

Paraphrase Mining

from sentence_transformers import SentenceTransformer, util

# https://www.sbert.net/examples/applications/paraphrase-mining/README.html

model = SentenceTransformer('all-MiniLM-L6-v2')

paraphrases = util.paraphrase_mining(model, data, show_progress_bar=True)

for paraphrase in paraphrases[0:1]:
    score, i, j = paraphrase
    print("{} \t\t {} \t\t Score: {:.4f}".format(data[i], data[j], score))
    
df_paraphrases = pd.DataFrame(paraphrases, columns=['score', 'idx1', 'idx2'])
Batches:   0%|          | 0/191 [00:00<?, ?it/s]Batches:   1%|          | 1/191 [00:01<03:35,  1.14s/it]Batches:   1%|          | 2/191 [00:01<01:55,  1.63it/s]Batches:   2%|▏         | 3/191 [00:01<01:20,  2.35it/s]Batches:   2%|▏         | 4/191 [00:01<01:01,  3.02it/s]Batches:   3%|▎         | 5/191 [00:01<00:51,  3.61it/s]Batches:   3%|▎         | 6/191 [00:02<00:44,  4.15it/s]Batches:   4%|▎         | 7/191 [00:02<00:39,  4.71it/s]Batches:   4%|▍         | 8/191 [00:02<00:35,  5.17it/s]Batches:   5%|▍         | 9/191 [00:02<00:32,  5.63it/s]Batches:   5%|▌         | 10/191 [00:02<00:30,  6.01it/s]Batches:   6%|▌         | 11/191 [00:02<00:28,  6.36it/s]Batches:   6%|▋         | 12/191 [00:02<00:27,  6.51it/s]Batches:   7%|▋         | 13/191 [00:03<00:28,  6.34it/s]Batches:   7%|▋         | 14/191 [00:03<00:26,  6.70it/s]Batches:   8%|▊         | 15/191 [00:03<00:26,  6.65it/s]Batches:   8%|▊         | 16/191 [00:03<00:26,  6.66it/s]Batches:   9%|▉         | 17/191 [00:03<00:25,  6.78it/s]Batches:   9%|▉         | 18/191 [00:03<00:26,  6.57it/s]Batches:  10%|▉         | 19/191 [00:04<00:26,  6.56it/s]Batches:  10%|█         | 20/191 [00:04<00:25,  6.75it/s]Batches:  11%|█         | 21/191 [00:04<00:25,  6.54it/s]Batches:  12%|█▏        | 22/191 [00:04<00:25,  6.65it/s]Batches:  12%|█▏        | 23/191 [00:04<00:25,  6.66it/s]Batches:  13%|█▎        | 24/191 [00:04<00:24,  6.71it/s]Batches:  13%|█▎        | 25/191 [00:04<00:24,  6.77it/s]Batches:  14%|█▎        | 26/191 [00:05<00:22,  7.23it/s]Batches:  14%|█▍        | 27/191 [00:05<00:24,  6.79it/s]Batches:  15%|█▍        | 28/191 [00:05<00:24,  6.78it/s]Batches:  15%|█▌        | 29/191 [00:05<00:23,  6.90it/s]Batches:  16%|█▌        | 30/191 [00:05<00:21,  7.34it/s]Batches:  16%|█▌        | 31/191 [00:05<00:22,  7.24it/s]Batches:  17%|█▋        | 32/191 [00:05<00:21,  7.42it/s]Batches:  17%|█▋        | 33/191 [00:06<00:21,  7.48it/s]Batches:  18%|█▊        | 34/191 [00:06<00:20,  7.55it/s]Batches:  19%|█▉        | 36/191 [00:06<00:19,  7.79it/s]Batches:  19%|█▉        | 37/191 [00:06<00:18,  8.13it/s]Batches:  20%|█▉        | 38/191 [00:06<00:18,  8.29it/s]Batches:  20%|██        | 39/191 [00:06<00:17,  8.61it/s]Batches:  21%|██        | 40/191 [00:06<00:17,  8.41it/s]Batches:  21%|██▏       | 41/191 [00:06<00:17,  8.61it/s]Batches:  22%|██▏       | 42/191 [00:07<00:17,  8.28it/s]Batches:  23%|██▎       | 43/191 [00:07<00:18,  7.98it/s]Batches:  23%|██▎       | 44/191 [00:07<00:18,  8.05it/s]Batches:  24%|██▎       | 45/191 [00:07<00:17,  8.45it/s]Batches:  25%|██▍       | 47/191 [00:07<00:16,  8.59it/s]Batches:  26%|██▌       | 49/191 [00:07<00:16,  8.84it/s]Batches:  27%|██▋       | 51/191 [00:08<00:15,  8.84it/s]Batches:  27%|██▋       | 52/191 [00:08<00:16,  8.57it/s]Batches:  28%|██▊       | 53/191 [00:08<00:16,  8.29it/s]Batches:  29%|██▉       | 55/191 [00:08<00:15,  8.86it/s]Batches:  30%|██▉       | 57/191 [00:08<00:14,  9.10it/s]Batches:  31%|███       | 59/191 [00:08<00:13,  9.50it/s]Batches:  32%|███▏      | 61/191 [00:09<00:13,  9.98it/s]Batches:  33%|███▎      | 63/191 [00:09<00:12, 10.22it/s]Batches:  34%|███▍      | 65/191 [00:09<00:12, 10.36it/s]Batches:  35%|███▌      | 67/191 [00:09<00:12, 10.22it/s]Batches:  36%|███▌      | 69/191 [00:09<00:11, 10.26it/s]Batches:  37%|███▋      | 71/191 [00:10<00:11, 10.41it/s]Batches:  38%|███▊      | 73/191 [00:10<00:11, 10.35it/s]Batches:  39%|███▉      | 75/191 [00:10<00:11, 10.44it/s]Batches:  40%|████      | 77/191 [00:10<00:10, 10.94it/s]Batches:  41%|████▏     | 79/191 
[00:10<00:09, 11.21it/s]Batches:  42%|████▏     | 81/191 [00:11<00:09, 11.08it/s]Batches:  43%|████▎     | 83/191 [00:11<00:09, 11.37it/s]Batches:  45%|████▍     | 85/191 [00:11<00:09, 11.29it/s]Batches:  46%|████▌     | 87/191 [00:11<00:09, 11.06it/s]Batches:  47%|████▋     | 89/191 [00:11<00:08, 11.36it/s]Batches:  48%|████▊     | 91/191 [00:11<00:08, 11.35it/s]Batches:  49%|████▊     | 93/191 [00:12<00:08, 11.95it/s]Batches:  50%|████▉     | 95/191 [00:12<00:08, 11.92it/s]Batches:  51%|█████     | 97/191 [00:12<00:07, 12.00it/s]Batches:  52%|█████▏    | 99/191 [00:12<00:07, 12.26it/s]Batches:  53%|█████▎    | 101/191 [00:12<00:07, 12.11it/s]Batches:  54%|█████▍    | 103/191 [00:12<00:07, 12.10it/s]Batches:  55%|█████▍    | 105/191 [00:13<00:07, 12.05it/s]Batches:  56%|█████▌    | 107/191 [00:13<00:06, 12.52it/s]Batches:  57%|█████▋    | 109/191 [00:13<00:06, 12.50it/s]Batches:  58%|█████▊    | 111/191 [00:13<00:06, 12.48it/s]Batches:  59%|█████▉    | 113/191 [00:13<00:06, 12.13it/s]Batches:  60%|██████    | 115/191 [00:13<00:06, 12.37it/s]Batches:  61%|██████▏   | 117/191 [00:14<00:05, 12.40it/s]Batches:  62%|██████▏   | 119/191 [00:14<00:05, 12.61it/s]Batches:  63%|██████▎   | 121/191 [00:14<00:05, 13.18it/s]Batches:  64%|██████▍   | 123/191 [00:14<00:05, 13.50it/s]Batches:  65%|██████▌   | 125/191 [00:14<00:04, 13.60it/s]Batches:  66%|██████▋   | 127/191 [00:14<00:04, 13.83it/s]Batches:  68%|██████▊   | 129/191 [00:14<00:04, 13.46it/s]Batches:  69%|██████▊   | 131/191 [00:15<00:04, 13.31it/s]Batches:  70%|██████▉   | 133/191 [00:15<00:04, 13.89it/s]Batches:  71%|███████   | 135/191 [00:15<00:03, 14.27it/s]Batches:  72%|███████▏  | 137/191 [00:15<00:03, 14.45it/s]Batches:  73%|███████▎  | 139/191 [00:15<00:03, 14.21it/s]Batches:  74%|███████▍  | 141/191 [00:15<00:03, 14.24it/s]Batches:  75%|███████▍  | 143/191 [00:15<00:03, 13.68it/s]Batches:  76%|███████▌  | 145/191 [00:16<00:03, 13.60it/s]Batches:  77%|███████▋  | 147/191 [00:16<00:03, 13.99it/s]Batches:  78%|███████▊  | 149/191 [00:16<00:02, 14.03it/s]Batches:  79%|███████▉  | 151/191 [00:16<00:02, 14.52it/s]Batches:  80%|████████  | 153/191 [00:16<00:02, 14.63it/s]Batches:  81%|████████  | 155/191 [00:16<00:02, 14.46it/s]Batches:  82%|████████▏ | 157/191 [00:16<00:02, 15.11it/s]Batches:  83%|████████▎ | 159/191 [00:16<00:02, 15.29it/s]Batches:  84%|████████▍ | 161/191 [00:17<00:01, 15.48it/s]Batches:  85%|████████▌ | 163/191 [00:17<00:01, 15.61it/s]Batches:  86%|████████▋ | 165/191 [00:17<00:01, 15.69it/s]Batches:  87%|████████▋ | 167/191 [00:17<00:01, 16.02it/s]Batches:  88%|████████▊ | 169/191 [00:17<00:01, 16.62it/s]Batches:  90%|████████▉ | 171/191 [00:17<00:01, 16.87it/s]Batches:  91%|█████████ | 173/191 [00:17<00:01, 16.99it/s]Batches:  92%|█████████▏| 175/191 [00:17<00:00, 17.18it/s]Batches:  93%|█████████▎| 177/191 [00:17<00:00, 17.49it/s]Batches:  94%|█████████▎| 179/191 [00:18<00:00, 17.55it/s]Batches:  95%|█████████▍| 181/191 [00:18<00:00, 18.10it/s]Batches:  96%|█████████▌| 183/191 [00:18<00:00, 18.59it/s]Batches:  97%|█████████▋| 185/191 [00:18<00:00, 18.25it/s]Batches:  98%|█████████▊| 188/191 [00:18<00:00, 19.33it/s]Batches: 100%|██████████| 191/191 [00:18<00:00, 21.54it/s]Batches: 100%|██████████| 191/191 [00:18<00:00, 10.23it/s]
52. Feels too guilty         52. Feels too guilty        Score: 1.0000
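
The same model can also score a single pair of items directly with util.cos_sim, which is the building block for the measure-level comparisons below (the two example sentences here are made up):

# cosine similarity between two (hypothetical) questionnaire items
emb = model.encode(["Has trouble falling asleep",
                    "Finds it hard to get to sleep at night"], convert_to_tensor=True)
print(util.cos_sim(emb[0], emb[1]).item())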

Compare all pairwise combinations of clinical questionnaires

This takes some time so I’ve run it on openmind:

sbatch /om2/user/maedbh/healthy_brain_network/hpc_scripts/run_item_analysis.sh

dendrogram clustering: https://www.python-graph-gallery.com/404-dendrogram-with-heat-map (https://www.python-graph-gallery.com/)

# load sentence similarity
#data_dir = '/Users/maedbhking/Documents/healthy_brain_network/data/interim/subtypes'
data_dir = '/Users/maedbhking/Documents/hbn_data'
fname = 'sentence-similarity_HBN_mean_scores-all-MiniLM-L6-v2.csv'

df = pd.read_csv(os.path.join(data_dir, fname))

Generate heatmap of similarity between clinical questionnaires

# get measures
measures = df['idx1'].unique()

# get domains
domains = []
for abbrev in df_dict['datadic'].unique():
    domain_name = df_dict[df_dict['datadic']==abbrev]['domains'].unique()[0]
    domains.append(domain_name)
domains = np.array(domains)

# get assessments
assessments = []
for abbrev in df_dict['datadic'].unique():
    assessment_name = df_dict[df_dict['datadic']==abbrev]['assessment'].unique()[0]
    assessments.append(assessment_name)
assessments = np.array(assessments)
assessments = [f[0] for f in assessments] # abbrev name

tensor = np.ones((len(measures), len(measures)))
for i, idx1 in enumerate(measures):
    for ii, idx2 in enumerate(measures):
        
        # get score
        score = df[(df['idx1']==idx1) & (df['idx2']==idx2)]['mean_score']
        
        if score.shape[0]==0:
            score = df[(df['idx1']==idx2) & (df['idx2']==idx1)]['mean_score']
        
        tensor[i,ii] = score;

tensor_df = pd.DataFrame(tensor, columns=assessments)

tensor_df['new_col'] = assessments
tensor_df = tensor_df.set_index('new_col')

# drop rows and columns that contain NaN
tensor_df = tensor_df.drop(['n'], axis=0).drop(['n'], axis=1)
g = sns.clustermap(tensor_df, 
               standard_scale=1, 
               metric="euclidean", 
               robust=True, 
               row_cluster=True,
               #method="ward",
               #center=1,
               yticklabels=True, 
               xticklabels=True,
              ); #method="ward"

g.ax_heatmap.yaxis.set_ticks_position("right")

plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), fontsize=7, rotation=45)
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), fontsize=7, rotation=45)
plt.savefig(os.path.join(save_dir, 'All_Measures_heatmap-assessment.png'))
plt.show()

Take the diagonal

The majority of measures have a within-measure semantic similarity of about 0.4 (e.g., YSR vs. YSR).

plotting_style()
sns.distplot(np.diagonal(np.array(tensor_df)))
plt.savefig(os.path.join(save_dir, 'All_measures_diagonal.png'))

Cluster heatmap based on Parent, Child, Teacher assessment

# get tensors

def get_tensor_subset(
    df,
    df_dict,
    measures,
    domains,
    assessment='Child Measures',
    labels='domains'
    ):
    """Return a (measure x measure) mean-similarity matrix restricted to one assessment
    type (e.g., 'Child Measures'), labelled either by domain or by measure name."""

    bool_all = []
    for measure in measures:
        # check assessment
        check = df_dict[df_dict['datadic']==measure]['assessment'].unique()

        if check==assessment:
            idx = True
        else:
            idx = False
        bool_all.append(idx)

    # get measures
    measures_subset = measures[bool_all]
    domains_subset = domains[bool_all]
    
    tensor = np.ones((len(measures_subset), len(measures_subset)))
    for i, idx1 in enumerate(measures_subset):
        for ii, idx2 in enumerate(measures_subset):

            # get score
            score = df[(df['idx1']==idx1) & (df['idx2']==idx2)]['mean_score']

            if score.shape[0]==0:
                score = df[(df['idx1']==idx2) & (df['idx2']==idx1)]['mean_score']

            tensor[i,ii] = score;
    
    if labels=='domains':
        tensor_subset = pd.DataFrame(tensor, columns=domains_subset)
        tensor_subset['new_col'] = domains_subset
    elif labels=='measures':
        tensor_subset = pd.DataFrame(tensor, columns=measures_subset)
        tensor_subset['new_col'] = measures_subset
    
    tensor_subset = tensor_subset.set_index('new_col')
    
    return tensor_subset
# get tensor

tensor_subset = get_tensor_subset(df, df_dict, measures, domains, assessment='Child Measures', labels='measures')
g = sns.clustermap(tensor_subset, 
               #standard_scale=1, 
               metric="euclidean", 
               robust=True, 
               row_cluster=True,
               #method="ward",
               #center=1,
               yticklabels=True, 
               xticklabels=True,
               #linewidths=0.004
              );


g.ax_heatmap.yaxis.set_ticks_position("right")

plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), fontsize=10)
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), fontsize=10)
plt.savefig(os.path.join(save_dir, 'Child_Measures_heatmap-measures.png'))
plt.show()

plotting_style()
sns.distplot(np.diagonal(np.array(tensor_subset)))
<Axes: ylabel='Density'>

from hbn.models.item_analysis import sentence_similarity_single_pair

tensor_dict = sentence_similarity_single_pair(
    data_dictionary=df_dict, 
    transformer='all-MiniLM-L6-v2', 
    pairwise_measures=('YSR', 'CBCL')
    )
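
sentence_similarity_single_pair lives in hbn.models.item_analysis; as a rough sketch (an assumption, not the actual implementation), it presumably encodes the items of both measures and returns the full item-by-item cosine-similarity matrix:

import pandas as pd
from sentence_transformers import SentenceTransformer, util

def sentence_similarity_single_pair_sketch(data_dictionary, transformer, pairwise_measures):
    model = SentenceTransformer(transformer)
    m1, m2 = pairwise_measures

    # items (questions) belonging to each of the two measures
    items1 = data_dictionary.loc[data_dictionary['datadic'] == m1, 'questions'].tolist()
    items2 = data_dictionary.loc[data_dictionary['datadic'] == m2, 'questions'].tolist()

    # cosine similarity between every item of measure 1 and every item of measure 2
    emb1 = model.encode(items1, convert_to_tensor=True)
    emb2 = model.encode(items2, convert_to_tensor=True)
    cosine_scores = util.cos_sim(emb1, emb2).cpu().numpy()

    return {'cosine_scores': pd.DataFrame(cosine_scores, index=items1, columns=items2)}
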
g = sns.clustermap(tensor_dict['cosine_scores'], 
               #standard_scale=1, 
               metric="euclidean", 
               robust=True, 
               row_cluster=True,
               #method="ward",
               #center=1,
               yticklabels=True, 
               xticklabels=True,
               #linewidths=0.004
              );


g.ax_heatmap.yaxis.set_ticks_position("right")

plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), fontsize=10)
plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), fontsize=10)
plt.show()