In [28]:
import dill
import pickle
import numpy as np

# Load the DDI adjacency matrix
with open("ddi_A_final.pkl", "rb") as f:
    ddi_matrix = dill.load(f)

# Load the vocabulary mappings for diagnoses, procedures, and medications
with open("voc_final.pkl", "rb") as f:
    voc_final = dill.load(f)
diag_voc, pro_voc, med_voc = voc_final["diag_voc"], voc_final["pro_voc"], voc_final["med_voc"]

# Load patient records with diagnoses, procedures, and medications
with open("records_final.pkl", "rb") as f:
    records = dill.load(f)


In [10]:
# Count unique drugs based on ddi_matrix size
num_drugs = ddi_matrix.shape[0]

# Count unique diagnoses, procedures, and medications
num_diagnoses = len(diag_voc.idx2word) if hasattr(diag_voc, "idx2word") else len(diag_voc)
num_procedures = len(pro_voc.idx2word) if hasattr(pro_voc, "idx2word") else len(pro_voc)
num_medications = len(med_voc.idx2word) if hasattr(med_voc, "idx2word") else len(med_voc)

# Total number of unique entities
num_entities = num_drugs + num_diagnoses + num_procedures + num_medications
print(f"Total number of entities: {num_entities}")

Total number of entities: 3612


In [32]:
ddi_matrix 

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [34]:
voc_final

<__main__.Voc at 0x148c1cb84580>

In [11]:
relations = {"<interacts_with>", "<has_diagnosis>", "<has_procedure>", "<has_medication>"}
num_relations = len(relations)
print(f"Total number of unique relations: {num_relations}")


Total number of unique relations: 4


In [2]:
import dill
import pickle
import numpy as np


with open("records_final.pkl", "rb") as f:
    records = dill.load(f)

with open("voc_final.pkl", "rb") as f:
    voc_final = dill.load(f)
diag_voc, pro_voc, med_voc = voc_final["diag_voc"], voc_final["pro_voc"], voc_final["med_voc"]

with open("ddi_A_final.pkl", "rb") as f:
    ddi_matrix = dill.load(f)

num_users = len(records)  #
num_items = len(med_voc.idx2word) + len(diag_voc.idx2word) + len(pro_voc.idx2word)


# For each patient, count the total number of unique diagnoses, procedures, and medications
interactions = 0
for record in records:
    if len(record) > 0:  
        interactions += len(record[0])
    if len(record) > 1: 
        interactions += len(record[1])
    if len(record) > 2: 
        interactions += len(record[2])

# Density
density = interactions / (num_users * num_items)

# Knowledge Graph Statistics
# Entities
num_entities = num_items + ddi_matrix.shape[0]  # Including drugs in ddi_matrix

# Relations
# Assuming relations include `<interacts_with>`, `<has_diagnosis>`, `<has_procedure>`, `<has_medication>`
num_relations = 4

# Triplets: Sum of interactions and DDI matrix non-zero entries
ddi_interactions = np.count_nonzero(ddi_matrix)  # Drug-drug interactions
triplets = interactions + ddi_interactions

# Display results
print("Statistics:")
print(f"# Users: {num_users}")
print(f"# Items: {num_items}")
print(f"# Interactions: {interactions}")
print(f"# Density: {density:.2e}")
print("Knowledge Graph:")
print(f"# Entities: {num_entities}")
print(f"# Relations: {num_relations}")
print(f"# Triplets: {triplets}")

Statistics:
# Users: 6350
# Items: 3500
# Interactions: 40326
# Density: 1.81e-03
Knowledge Graph:
# Entities: 3612
# Relations: 4
# Triplets: 41000


In [4]:
import pickle
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split


num_users = len(records)
num_diagnoses = len(diag_voc.idx2word)
num_procedures = len(pro_voc.idx2word)
num_medications = len(med_voc.idx2word)

# Total items = diagnoses + procedures + medications
num_items = num_diagnoses + num_procedures + num_medications


rows, cols, data = [], [], []
for user_id, record in enumerate(records):
    if len(record) > 0:  
        for diag_list in record[0]:  
            for diag in diag_list:
                if diag < num_diagnoses:  
                    rows.append(user_id)
                    cols.append(diag)
                    data.append(1)
    if len(record) > 1:  
        for proc_list in record[1]:  
            for proc in proc_list:
                if proc < num_procedures:  
                    rows.append(user_id)
                    cols.append(num_diagnoses + proc) 
                    data.append(1)
    if len(record) > 2:  
        for med_list in record[2]:  
            for med in med_list:
                if med < num_medications:  
                    rows.append(user_id)
                    cols.append(num_diagnoses + num_procedures + med)  
                    data.append(1)


interaction_matrix = coo_matrix((data, (rows, cols)), shape=(num_users, num_items))

# Split into train and test matrices
train_matrix, test_matrix = train_test_split(interaction_matrix, test_size=0.2, random_state=42)

# Save the matrices
with open("trnMat.pkl", "wb") as f:
    pickle.dump(train_matrix, f)

with open("tstMat.pkl", "wb") as f:
    pickle.dump(test_matrix, f)


In [7]:
num_users

6350

In [8]:
num_items

3500

In [9]:
train_matrix

<5080x3500 sparse matrix of type '<class 'numpy.int64'>'
	with 361680 stored elements in Compressed Sparse Row format>

In [10]:
test_matrix

<1270x3500 sparse matrix of type '<class 'numpy.int64'>'
	with 91981 stored elements in Compressed Sparse Row format>

In [11]:
import pickle
import numpy as np
from scipy.sparse import coo_matrix

def load_and_inspect(filename):
    with open(filename, "rb") as f:
        matrix = pickle.load(f)

    # convert the sparse matrix to dense or inspect specific properties
    if isinstance(matrix, coo_matrix):
        print(f"Matrix loaded from {filename}:")
        print(f"Shape: {matrix.shape}")
        print(f"Number of non-zero entries: {matrix.nnz}")
        print("Sample non-zero entries (row, col, data):")
        
        
        rows, cols, data = matrix.row, matrix.col, matrix.data
        for i in range(min(10, matrix.nnz)):
            print(f"({rows[i]}, {cols[i]}, {data[i]})")

       
        if matrix.shape[0] <= 10 and matrix.shape[1] <= 10:
            print("Full dense matrix representation:")
            print(matrix.toarray())
    else:
        print(f"{filename} is not a sparse matrix or in COO format.")
        print(matrix)

# Inspect the train and test matrices
load_and_inspect("trnMat.pkl")
load_and_inspect("tstMat.pkl")

trnMat.pkl is not a sparse matrix or in COO format.
  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 4)	1
  (0, 6)	1
  (0, 12)	1
  (0, 13)	1
  (0, 14)	2
  (0, 15)	2
  (0, 21)	1
  (0, 26)	1
  (0, 31)	1
  (0, 33)	1
  (0, 40)	1
  (0, 43)	1
  (0, 44)	1
  (0, 76)	1
  (0, 82)	1
  (0, 87)	1
  (0, 320)	1
  (0, 321)	1
  (0, 717)	1
  (0, 1958)	1
  (0, 1960)	1
  (0, 1961)	1
  :	:
  (5079, 1987)	1
  (5079, 1990)	1
  (5079, 1995)	1
  (5079, 1996)	1
  (5079, 1997)	1
  (5079, 1998)	1
  (5079, 1999)	1
  (5079, 2003)	1
  (5079, 2006)	1
  (5079, 2014)	1
  (5079, 2025)	1
  (5079, 2031)	1
  (5079, 2037)	1
  (5079, 2041)	1
  (5079, 2048)	1
  (5079, 2056)	1
  (5079, 2061)	1
  (5079, 2067)	1
  (5079, 2072)	1
  (5079, 2098)	1
  (5079, 2113)	1
  (5079, 2136)	1
  (5079, 2146)	1
  (5079, 2248)	1
  (5079, 2304)	1
tstMat.pkl is not a sparse matrix or in COO format.
  (0, 0)	1
  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 12)	1
  (0, 13)	2
  (0, 15)	1
  (0, 17)	2
  (0, 18)	2
  (0, 21)	1
  (0, 22)	1


In [40]:
# Display the first 10 lines 
file_path = "kg.txt"
num_lines_to_display = 50


try:
    with open(file_path, "r") as file:
        content_lines = [next(file).strip() for _ in range(num_lines_to_display)]
    print("Displaying the first 10 lines from kg.txt:\n")
    for line in content_lines:
        print(line)
    print("\nFile inspection complete.")
except FileNotFoundError:
    print("kg.txt file not found. Please ensure it was generated correctly.")

Displaying the first 10 lines from kg.txt:

User_0 <has_diagnosis> Diagnosis_[0, 1, 2, 3, 4, 5, 6, 7]
User_0 <has_diagnosis> Diagnosis_[0, 1, 2]
User_0 <has_diagnosis> Diagnosis_[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
User_0 <has_procedure> Procedure_[8, 9, 10, 7]
User_0 <has_procedure> Procedure_[3, 4, 1]
User_0 <has_procedure> Procedure_[0, 1, 2, 3, 5, 4, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18]
User_1 <has_diagnosis> Diagnosis_[11, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
User_1 <has_diagnosis> Diagnosis_[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
User_1 <has_diagnosis> Diagnosis_[3, 4, 12, 14, 13, 5, 19, 20, 21, 22, 23, 24, 25, 18, 1, 26, 2, 16, 27, 9, 28, 6, 29, 30]
User_1 <has_procedure> Procedure_[28, 29, 14, 18, 30, 1, 31, 32, 33, 34, 35, 36, 21, 37, 27, 25, 20, 38, 39, 40, 41]
User_1 <has_procedure> Procedure_[12, 17, 18]
User_1 <has_procedure> Procedure_[0, 3, 4, 6, 8, 19, 22, 18, 1, 2, 9, 27, 26, 31, 32, 33, 7, 34, 35, 17]
User_2 <has_

In [60]:
# update KG using actual ID
import dill
import pickle

# Load the records and voc_final to access the vocabularies for diagnoses, procedures, and medications
with open("records_final.pkl", "rb") as f:
    records = dill.load(f)

with open("voc_final.pkl", "rb") as f:
    voc_final = dill.load(f)
    
diag_voc, pro_voc, med_voc = voc_final["diag_voc"], voc_final["pro_voc"], voc_final["med_voc"]

# Use the actual IDs for users, diagnoses, procedures, and medications
triplets = []

# Map user-diagnosis-procedure-medication interactions to triplets
for user_id, record in enumerate(records):
    # Use actual user_id as it appears in the records
    user_int_id = user_id  # Directly use the index of the user as the ID

    # Diagnoses
    if len(record) > 0:
        for diag_list in record[0]:
            for diag in diag_list:
                diag_int_id = diag
                triplets.append(f"{user_int_id} 0 {diag_int_id}")

    # Procedures
    if len(record) > 1:
        for proc_list in record[1]:
            for proc in proc_list:
                proc_int_id = proc
                triplets.append(f"{user_int_id} 0 {proc_int_id}")

    # Medications
    if len(record) > 2:
        for med_list in record[2]:
            for med in med_list:
                med_int_id = med
                triplets.append(f"{user_int_id} 0 {med_int_id}")


with open("kgmm.txt", "w") as kgali_file:
    for triplet in triplets:
        kgali_file.write(triplet + "\n")

print("kgmm.txt file created with the actual IDs.")

kgmm.txt file created with the actual IDs.


In [3]:
import dill
from collections import Counter


with open("records_final.pkl", "rb") as f:
    records = dill.load(f)

with open("voc_final.pkl", "rb") as f:
    voc_final = dill.load(f)


diag_voc, pro_voc, med_voc = voc_final["diag_voc"], voc_final["pro_voc"], voc_final["med_voc"]


user_count = len(records)
diagnosis_ids = set()
procedure_ids = set()
medication_ids = set()


for user_id, record in enumerate(records):
    if len(record) > 0:
        for diag_list in record[0]:
            diagnosis_ids.update(diag_list)
    

    if len(record) > 1:
        for proc_list in record[1]:
            procedure_ids.update(proc_list)
    

    if len(record) > 2:
        for med_list in record[2]:
            medication_ids.update(med_list)


results = {
    "Number of users": user_count,
    "Number of unique diagnoses": len(diagnosis_ids),
    "Number of unique procedures": len(procedure_ids),
    "Number of unique medications": len(medication_ids)
}

results

{'Number of users': 6350,
 'Number of unique diagnoses': 1917,
 'Number of unique procedures': 1898,
 'Number of unique medications': 1593}

In [14]:
# check the finalized KG
kg_file_path = "kgmm.txt"


with open(kg_file_path, "r") as f:
    for _ in range(10):
        line = f.readline().strip()
        print(line)

print("\nFile inspection complete.")


0 0 0
0 0 1
0 0 2
0 0 3
0 0 4
0 0 5
0 0 6
0 0 7
0 0 0
0 0 1

File inspection complete.


## Further Process the KG by different index, duplictaes are found from the above 

In [126]:
import pickle
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.model_selection import train_test_split
import dill

# Load the necessary data files
with open("records_final.pkl", "rb") as f:
    records = dill.load(f)  # This file contains patient EHR records: diagnoses, procedures, and medications

with open("voc_final.pkl", "rb") as f:
    voc_final = dill.load(f)
diag_voc, pro_voc, med_voc = voc_final["diag_voc"], voc_final["pro_voc"], voc_final["med_voc"]

# Load DDI matrix for drug-drug interactions
with open("ddi_A_final.pkl", "rb") as f:
    ddi_matrix = dill.load(f)

# Prepare the indices
num_diagnoses = 3000
num_procedures = 3000
num_medications = 3000
diagnosis_start_id = 0
procedure_start_id = num_diagnoses
medication_start_id = num_diagnoses + num_procedures

# 1. Generate kg1.txt file
# Open the file to write the knowledge graph triplets
with open("kg1.txt", "w") as kg_file:
    # Iterate over each patient's record
    for user_id, record in enumerate(records):
        # Use sets to avoid duplicate entries
        unique_diagnoses = set()
        unique_procedures = set()
        unique_medications = set()

        # Flatten the list for diagnoses and add to the set
        if len(record) > 0:  # Diagnoses
            for diag_list in record[0]:  # Each entry in record[0] is a list
                unique_diagnoses.update(diag_list)  # Add all items from diag_list to the set

        # Write unique diagnoses to kg.txt
        for diag in unique_diagnoses:
            kg_file.write(f"Patient_{user_id} <has_diagnosis> Diagnosis_{diag}\n")

        # Flatten the list for procedures and add to the set
        if len(record) > 1:  # Procedures
            for proc_list in record[1]:  # Each entry in record[1] is a list
                unique_procedures.update(proc_list)  # Add all items from proc_list to the set

        # Write unique procedures to kg.txt
        for proc in unique_procedures:
            kg_file.write(f"Patient_{user_id} <has_procedure> Procedure_{proc}\n")

        # Flatten the list for medications and add to the set
        if len(record) > 2:  # Medications
            for med_list in record[2]:  # Each entry in record[2] is a list
                unique_medications.update(med_list)  # Add all items from med_list to the set

        # Write unique medications to kg.txt
        for med in unique_medications:
            kg_file.write(f"Patient_{user_id} <has_medication> Medication_{med}\n")

    # Include drug-drug interactions with relation type ID 3
    for i in range(ddi_matrix.shape[0]):
        for j in range(ddi_matrix.shape[1]):
            if ddi_matrix[i, j] != 0:
                kg_file.write(f"Drug_{i} 3 Drug_{j}\n")

print("kg1.txt file has been generated without duplicates, including 'interacts_with' relations.")


# 2. Generate trnMat.pkl and tstMat.pkl files (user-item interaction matrix)
rows, cols, data = [], [], []

# Iterate over each patient's record
for user_id, record in enumerate(records):
    # Use sets to collect unique diagnoses, procedures, and medications for each patient
    unique_diagnoses = set()
    unique_procedures = set()
    unique_medications = set()

    # Add diagnoses to the set
    if len(record) > 0:  # Diagnoses
        for diag_list in record[0]:  # Each entry in record[0] is a list
            unique_diagnoses.update(diag_list)

    # Add unique diagnoses to the matrix entries
    for diag in unique_diagnoses:
        rows.append(user_id)
        cols.append(diagnosis_start_id + diag)
        data.append(1)

    # Add procedures to the set
    if len(record) > 1: 
        for proc_list in record[1]:  
            unique_procedures.update(proc_list)

   
    for proc in unique_procedures:
        rows.append(user_id)
        cols.append(procedure_start_id + proc)
        data.append(1)

    
    if len(record) > 2:  
        for med_list in record[2]:  
            unique_medications.update(med_list)

   
    for med in unique_medications:
        rows.append(user_id)
        cols.append(medication_start_id + med)
        data.append(1)


num_users = len(records)
num_items = diagnosis_start_id + len(diag_voc.idx2word) + 3000 + 3000
interaction_matrix = coo_matrix((data, (rows, cols)), shape=(num_users, num_items))

# Split the data into train and test sets
train_matrix, test_matrix = train_test_split(interaction_matrix, test_size=0.2, random_state=42)


with open("trnMat1.pkl", "wb") as f:
    pickle.dump(train_matrix, f)

with open("tstMat1.pkl", "wb") as f:
    pickle.dump(test_matrix, f)

print("trnMat1.pkl and tstMat1.pkl files have been generated with unique entries.")

kg1.txt file has been generated without duplicates, including 'interacts_with' relations.
trnMat1.pkl and tstMat1.pkl files have been generated with unique entries.


In [131]:

kg_file_path = "kg1.txt"


try:
    with open(kg_file_path, "r") as kg_file:
        print("Displaying the first 100 lines from kg.txt:\n")
        for i in range(100):
            line = kg_file.readline().strip()
            print(line)
        print("\nFile inspection complete.")
except FileNotFoundError:
    print("kg1.txt file not found.")

Displaying the first 100 lines from kg.txt:

30000 0 40000
30000 0 40001
30000 0 40002
30000 0 40003
30000 0 40004
30000 0 40005
30000 0 40006
30000 0 40007
30000 0 40008
30000 0 40009
30000 0 40010
30000 0 40011
30000 0 40012
30000 0 40013
30000 0 40014
30000 0 40015
30000 1 50000
30000 1 50001
30000 1 50002
30000 1 50003
30000 1 50004
30000 1 50005
30000 1 50006
30000 1 50007
30000 1 50008
30000 1 50009
30000 1 50010
30000 1 50011
30000 1 50013
30000 1 50015
30000 1 50016
30000 1 50017
30000 1 50018
30001 0 40001
30001 0 40002
30001 0 40003
30001 0 40004
30001 0 40005
30001 0 40006
30001 0 40007
30001 0 40008
30001 0 40009
30001 0 40010
30001 0 40011
30001 0 40012
30001 0 40013
30001 0 40014
30001 0 40015
30001 0 40016
30001 0 40017
30001 0 40018
30001 0 40019
30001 0 40020
30001 0 40021
30001 0 40022
30001 0 40023
30001 0 40024
30001 0 40025
30001 0 40026
30001 0 40027
30001 0 40028
30001 0 40029
30001 0 40030
30001 1 50000
30001 1 50001
30001 1 50002
30001 1 50003
30001 1 50004
300

In [128]:
import pickle
from scipy.sparse import coo_matrix

# Load and inspect the matrix in trnMat1.pkl with format handling
def load_and_inspect_trnMat(file_path):
    try:
        with open(file_path, "rb") as f:
            interaction_matrix = pickle.load(f)
        
        # Convert to COO format if not already in COO format
        if not isinstance(interaction_matrix, coo_matrix):
            interaction_matrix = coo_matrix(interaction_matrix)
        
        print(f"Matrix loaded from {file_path}:")
        print(f"Shape: {interaction_matrix.shape}")
        print(f"Number of non-zero entries: {interaction_matrix.nnz}")

        # Display some sample non-zero entries
        rows, cols, data = interaction_matrix.row, interaction_matrix.col, interaction_matrix.data
        print("Sample non-zero entries (row, col, data):")
        for i in range(min(100, len(data))):
            print(f"({rows[i]}, {cols[i]}, {data[i]})")
        
        print("\nFile inspection complete.")
    except FileNotFoundError:
        print(f"{file_path} file not found.")
    except AttributeError:
        print(f"Error: Matrix in {file_path} does not support .row, .col, .data attributes. Ensure it's in COO format.")
    except Exception as e:
        print(f"Error loading {file_path}: {e}")


load_and_inspect_trnMat("trnMat1.pkl")

Matrix loaded from trnMat1.pkl:
Shape: (5080, 7958)
Number of non-zero entries: 376885
Sample non-zero entries (row, col, data):
(0, 0, 1)
(0, 1, 1)
(0, 2, 1)
(0, 4, 1)
(0, 6, 1)
(0, 12, 1)
(0, 13, 1)
(0, 14, 1)
(0, 15, 1)
(0, 21, 1)
(0, 26, 1)
(0, 31, 1)
(0, 33, 1)
(0, 40, 1)
(0, 43, 1)
(0, 44, 1)
(0, 76, 1)
(0, 82, 1)
(0, 87, 1)
(0, 320, 1)
(0, 321, 1)
(0, 717, 1)
(0, 3000, 1)
(0, 3002, 1)
(0, 3003, 1)
(0, 3006, 1)
(0, 3012, 1)
(0, 3013, 1)
(0, 3014, 1)
(0, 3015, 1)
(0, 3021, 1)
(0, 3024, 1)
(0, 3026, 1)
(0, 3031, 1)
(0, 3038, 1)
(0, 3040, 1)
(0, 3043, 1)
(0, 3047, 1)
(0, 3057, 1)
(0, 3066, 1)
(0, 3081, 1)
(0, 3087, 1)
(0, 3088, 1)
(0, 3128, 1)
(0, 3151, 1)
(0, 3265, 1)
(0, 3405, 1)
(0, 3599, 1)
(0, 3693, 1)
(0, 3717, 1)
(0, 3960, 1)
(1, 0, 1)
(1, 1, 1)
(1, 2, 1)
(1, 3, 1)
(1, 4, 1)
(1, 6, 1)
(1, 7, 1)
(1, 8, 1)
(1, 9, 1)
(1, 12, 1)
(1, 13, 1)
(1, 14, 1)
(1, 15, 1)
(1, 16, 1)
(1, 18, 1)
(1, 21, 1)
(1, 22, 1)
(1, 28, 1)
(1, 30, 1)
(1, 31, 1)
(1, 33, 1)
(1, 34, 1)
(1, 36, 1)
(1, 37, 1)