# cluster.py (semantic_sql_examples)
#
# Clusters stored embeddings in five phases: fit an IncrementalPCA on
# streamed batches, project all rows through it, reduce further with UMAP,
# cluster with HDBSCAN, then write the cluster labels back to the database.
146
import ast

import hdbscan
import numpy as np
import pandas as pd
from sklearn.decomposition import IncrementalPCA
from sklearn.preprocessing import normalize
from sqlalchemy import create_engine
import umap

import db_helper
from config import DB_URL, EMBED_TYPE


engine = create_engine(DB_URL)


###########################################################
# Phase 1: stream from Postgres into IncrementalPCA
# (Fit PCA using sequential partial fit - avoids loading all 100k rows at once.)
#
# Depending on the scikit-learn version, IncrementalPCA.partial_fit rejects a
# batch that has fewer rows than n_components, and the last chunk of a stream
# is usually a short remainder.  Each batch is therefore held back by one
# iteration so that a short batch can be merged with its neighbour before it
# is fitted.  When every chunk has at least PCA_DIMS rows, the sequence of
# partial_fit calls is exactly one call per chunk, in fetch order.
print("Phase 1...")

BATCH_SIZE = 5000
PCA_DIMS = 50  # Good intermediate compression before UMAP

ipca = IncrementalPCA(n_components=PCA_DIMS)

batchnum = 0
held_rows = None  # normalized batch waiting to be fitted; None until data arrives

for batch_df in db_helper.fetch_embedding_chunks(
    engine,
    EMBED_TYPE,
    chunk_size=BATCH_SIZE
):
    if len(batch_df) == 0:
        # np.vstack cannot stack an empty sequence, and there is nothing to
        # fit in an empty chunk anyway.
        continue

    # Each "embedding" value is parsed from its Python-literal text form into
    # one matrix row, then rows are rescaled with sklearn's normalize().
    X = np.vstack(batch_df["embedding"].apply(ast.literal_eval).values)
    X = normalize(X)

    if held_rows is None:
        held_rows = X
        continue

    if min(held_rows.shape[0], X.shape[0]) < PCA_DIMS:
        # One side is too short to be fitted on its own: merge and keep
        # holding, so no rows are dropped from the fit.
        held_rows = np.vstack((held_rows, X))
        continue

    ipca.partial_fit(held_rows)
    held_rows = X

    batchnum += 1

    print("pca-fit batch", batchnum)

if held_rows is None:
    # Fail here with a clear message instead of leaving ipca unfitted and
    # crashing later with an unrelated-looking error.
    raise RuntimeError("Phase 1: no embeddings were fetched; cannot fit PCA.")

# Fit whatever is still held (the final batch, possibly merged with a short
# remainder).  If the whole dataset has fewer than PCA_DIMS rows, partial_fit
# raises its own ValueError here.
ipca.partial_fit(held_rows)

batchnum += 1

print("pca-fit batch", batchnum)


###########################################################
# Phase 2: project every row through the fitted PCA
# (The PCA from phase 1 is applied to a second pass over the same query,
# and the reduced batches are stacked into one global matrix for the UMAP
# in phase 3.)
# NOTE(review): this assumes the second fetch yields the same rows as the
# first; phase 5 pairs all_ids with the cluster labels by position, so the
# row order collected here is what ties each label to its id.
print("Phase 2...")

all_ids = []
all_tags = []  # collected alongside the ids; nothing later in this script reads it
all_vectors = []

batchnum = 0

chunk_stream = db_helper.fetch_embedding_chunks(
    engine,
    EMBED_TYPE,
    chunk_size=BATCH_SIZE
)

for batchnum, batch_df in enumerate(chunk_stream, start=1):
    # Same preprocessing as the PCA fit: parse the literal text of each
    # embedding into a row, stack the rows, then normalize them.
    parsed_rows = batch_df["embedding"].apply(ast.literal_eval).values
    unit_rows = normalize(np.vstack(parsed_rows))

    all_vectors.append(ipca.transform(unit_rows))
    all_ids += batch_df["id"].tolist()
    all_tags += batch_df["tag"].tolist()

    print("pca-transformed batch", batchnum)

X_pca_all = np.vstack(all_vectors)

print(
    "~100k x 50 matrix X_pca_all stacked now; actual size:",
    X_pca_all.shape,
)

# The stacked matrix has PCA_DIMS columns (roughly 100k × 50 rather than
# 100k × 768 per the original note), so it is more likely to fit in memory.


###########################################################
# Phase 3: one global UMAP fit over the PCA-reduced matrix from phase 2
# (Expected to be feasible at roughly 100k × 50 - to be confirmed in practice.)
print("Phase 3...")

# Output dimensionality handed to the clustering step.
# Values to experiment with:
#   10  -> tighter/local structure
#   30  -> more global structure preserved
umap_dims = 10

# Settings are gathered in one place so they are easy to tweak; random_state
# is pinned so repeated runs use the same seed.
umap_settings = {
    "n_components": umap_dims,
    "n_neighbors": 15,
    "min_dist": 0.0,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = umap.UMAP(**umap_settings)

X_umap = umap_model.fit_transform(X_pca_all)
print("UMAP model fitted.")


###########################################################
# Phase 4: one global HDBSCAN run over the UMAP-reduced matrix
# (roughly 100k x 10 with the settings above)
print("Phase 4...")

hdbscan_settings = {
    "min_cluster_size": 25,  # earlier was 10
    "min_samples": 10,       # earlier was 5
    "metric": "euclidean",   # could be "cosine", eg if no dim-red
    "prediction_data": True,
}
clusterer = hdbscan.HDBSCAN(**hdbscan_settings)

# One label per row of X_umap, in the same row order.
cluster_labels = clusterer.fit_predict(X_umap)
print("cluster_labels fitted/assigned via HDBSCAN.")

# Report how many rows landed in each label, ordered by label value.
print("\n counts per cluster:")
cluster_series = pd.Series(cluster_labels)
counts_by_label = cluster_series.value_counts().sort_index()
print(counts_by_label)


###########################################################
# Phase 5: store each row's cluster label via db_helper.insert_embedding_tag
# (target per the original note: embeddings_768.tag)
print("Phase 5...")

# Ids and labels are matched purely by position.  Labels are converted to
# plain Python ints before being placed in the tag payload.
labelled_ids = zip(all_ids, map(int, cluster_labels))

# engine.begin() wraps all the writes below in a single transaction.
with engine.begin() as conn:
    for embed_id, label in labelled_ids:
        tag_payload = {"cluster": label}
        db_helper.insert_embedding_tag(conn, embed_id, tag_payload)

print("cluster labels written back to embeddings_768.tag")


# HDBSCAN uses:
#   -1 => noise/outlier
#    0,1,2,... => clusters

# df["cluster"] = cluster_labels
# print(df["cluster"].value_counts().sort_index())