Qdrant - bounswe/bounswe2025group5 GitHub Wiki
- Just provide the MySQL password in the configuration section of the code. For installation:
- Download the asset that matches your PC from the Assets section of the link (v1.15.5 assets).
- Extract the zip, then in your terminal go to the directory that contains the Qdrant executable.
- Run the Qdrant executable with `./qdrant`; Qdrant is now running on your local machine.
pip install mysql-connector-python==9.5.0 qdrant-client==1.15.1 sentence-transformers==5.1.2 torch==2.9.0
import mysql.connector
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer
import sys
import time
# --- 1. CONFIGURATION ---
# Keyword arguments passed to mysql.connector.connect().
# Replace the password placeholder with the real credential before running.
MYSQL_CONFIG = dict(
    user='root',
    password='our-mysql-password',
    host='35.232.41.48',
    port=3306,
    database='waste_less',
)

# --- Qdrant Configuration ---
# The script talks to a Qdrant instance running on the same machine,
# hence 'localhost'; 6333 is the HTTP port the Python client connects to.
QDRANT_HOST = 'localhost'
QDRANT_PORT = 6333
QDRANT_COLLECTION = 'forum_posts'

# --- Model Configuration ---
# VECTOR_DIMENSION is the vector size used when the collection is created,
# so it has to match the embedding size produced by MODEL_NAME.
MODEL_NAME = 'all-MiniLM-L6-v2'
VECTOR_DIMENSION = 384
# --- 2. SCRIPT ---
def connect_to_services():
    """Open the Qdrant client, load the embedding model, and connect to MySQL.

    The three resources are created in that order, with a progress message
    printed before each step.

    Returns:
        A ``(qdrant_client, model, mysql_conn)`` tuple.

    Raises:
        SystemExit: with status 1 if any of the three steps raises; the
            error is printed first.
    """
    try:
        print(f"Connecting to Qdrant at {QDRANT_HOST}:{QDRANT_PORT}...")
        vector_store = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

        print(f"Loading embedding model '{MODEL_NAME}'... (This may take a moment)")
        encoder = SentenceTransformer(MODEL_NAME)

        mysql_host = MYSQL_CONFIG['host']
        print(f"Connecting to MySQL at {mysql_host}...")
        db_connection = mysql.connector.connect(**MYSQL_CONFIG)
    except Exception as err:
        print(f"Error during initialization: {err}")
        print("Please check all your connection details and ensure servers are running.")
        sys.exit(1)

    return vector_store, encoder, db_connection
def ensure_qdrant_collection(client):
    """Create the ``QDRANT_COLLECTION`` collection in Qdrant if it is missing.

    The collection is created with ``VECTOR_DIMENSION``-sized vectors and
    cosine distance. An existing collection is left untouched.

    Args:
        client: A connected ``QdrantClient``.

    Raises:
        SystemExit: with status 1 if the existence check or the creation
            raises; the error is printed first.
    """
    try:
        if client.collection_exists(collection_name=QDRANT_COLLECTION):
            print(f"Collection '{QDRANT_COLLECTION}' already exists.")
            return

        print(f"Collection '{QDRANT_COLLECTION}' not found. Creating it...")
        # create_collection never drops data. The previously used
        # recreate_collection is deprecated and deletes an existing
        # collection first, which could wipe vectors if another process
        # created the collection between the check and this call.
        client.create_collection(
            collection_name=QDRANT_COLLECTION,
            vectors_config=VectorParams(
                size=VECTOR_DIMENSION,
                distance=Distance.COSINE,
            ),
        )
        print("Collection created.")
    except Exception as e:
        print(f"Error checking/creating Qdrant collection: {e}")
        sys.exit(1)
def fetch_posts_from_mysql(conn):
    """Read every row of the ``posts`` table as ``(post_id, content)`` pairs.

    Args:
        conn: An open MySQL connection whose ``cursor()`` can be used as a
            context manager.

    Returns:
        The rows returned by ``cursor.fetchall()`` for the query.

    Raises:
        SystemExit: with status 1 if the query raises; the error is
            printed first.
    """
    query = "SELECT post_id, content FROM posts"
    try:
        print("Fetching posts from MySQL...")
        with conn.cursor() as cur:
            cur.execute(query)
            rows = cur.fetchall()
        print(f"Found {len(rows)} posts in MySQL.")
    except Exception as err:
        print(f"Error fetching from MySQL: {err}")
        sys.exit(1)
    return rows
def _fetch_existing_ids(client, page_size=1000):
    """Return the set of all point IDs stored in ``QDRANT_COLLECTION``.

    Pages through the collection with ``scroll`` and requests neither
    payloads nor vectors, so only the IDs are transferred.
    """
    existing_ids = set()
    offset = None  # None asks Qdrant to start from the first point
    while True:
        points, offset = client.scroll(
            collection_name=QDRANT_COLLECTION,
            limit=page_size,
            offset=offset,
            with_payload=False,
            with_vectors=False,
        )
        existing_ids.update(point.id for point in points)
        # A None offset marks the last page; the empty-page check guards
        # against looping forever on an unexpected response.
        if not points or offset is None:
            return existing_ids


def _upload_batch(client, points):
    """Upsert ``points`` into ``QDRANT_COLLECTION`` and wait until applied."""
    client.upsert(
        collection_name=QDRANT_COLLECTION,
        points=points,
        wait=True,
    )


def backfill_embeddings(batch_size=10):
    """Embed every MySQL post that is missing from Qdrant and upload it.

    Steps: connect to all services, make sure the collection exists, read
    all posts, collect the IDs already stored in Qdrant, then encode and
    upsert the missing posts in batches. Posts whose content is empty or
    whitespace-only are skipped. A summary with the number of vectors that
    were actually uploaded and the elapsed time is printed at the end.

    Args:
        batch_size: Number of vectors to accumulate before each upsert.
            Values below 1 cause every vector to be uploaded on its own.

    Raises:
        SystemExit: propagated from the connection, collection, or fetch
            helpers when they fail.
    """
    qdrant_client, model, mysql_conn = connect_to_services()
    try:
        ensure_qdrant_collection(qdrant_client)
        posts = fetch_posts_from_mysql(mysql_conn)
    finally:
        # All rows are in memory after the fetch, so release the database
        # connection now, and also when a helper exits the process.
        mysql_conn.close()

    print("\n--- Starting Embedding Backfill ---")
    start_time = time.time()

    # Collect the IDs already in Qdrant up front so each post needs only a
    # set lookup instead of a round trip.
    try:
        print("Fetching existing IDs from Qdrant to prevent duplicates...")
        existing_ids = _fetch_existing_ids(qdrant_client)
        print(f"Found {len(existing_ids)} existing vectors in Qdrant.")
    except Exception as e:
        # With no ID list every post is treated as new. Upsert overwrites
        # points with the same ID, so this is slower but not harmful.
        print(
            f"Warning: Could not fetch existing IDs from Qdrant ({e}). "
            "Every post will be re-embedded and upserted."
        )
        existing_ids = set()

    points_to_upload = []
    uploaded_count = 0

    for post_id, content in posts:
        if post_id in existing_ids:
            continue
        if not content or content.isspace():
            print(f"Skipping post {post_id}: Content is empty.")
            continue

        print(f"Embedding post {post_id}...")
        try:
            vector = model.encode(content).tolist()
            points_to_upload.append(PointStruct(id=post_id, vector=vector))

            if len(points_to_upload) >= batch_size:
                _upload_batch(qdrant_client, points_to_upload)
                # Count only after the upsert succeeded so the summary
                # reflects what is really stored.
                uploaded_count += len(points_to_upload)
                print(f"Uploaded batch of {len(points_to_upload)} vectors.")
                points_to_upload = []
        except Exception as e:
            # If the upsert failed, the batch is kept and retried together
            # with the next post.
            print(f"Error embedding or uploading post {post_id}: {e}")

    # Flush whatever is left over from the last partial batch.
    if points_to_upload:
        try:
            _upload_batch(qdrant_client, points_to_upload)
            uploaded_count += len(points_to_upload)
            print(f"Uploaded final batch of {len(points_to_upload)} vectors.")
        except Exception as e:
            print(f"Error during final batch upload: {e}")

    end_time = time.time()
    print("\n--- Backfill Finished ---")
    print(f"Added {uploaded_count} new vectors.")
    print(f"Total time: {end_time - start_time:.2f} seconds.")
# Run the backfill only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    backfill_embeddings()