Qdrant - bounswe/bounswe2025group5 GitHub Wiki

  • Just provide the MySQL password in the configuration section of the code. For installation:
    • Download the asset for your platform from the Assets section of the link (v1.15.5 assets)
    • Extract the zip and, in your terminal, go to the directory that contains the Qdrant executable
    • Run the Qdrant executable with ./qdrant; it is now running on your local machine
pip install mysql-connector-python==9.5.0 qdrant-client==1.15.1 sentence-transformers==5.1.2 torch==2.9.0
import os
import sys
import time

import mysql.connector
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, Distance, VectorParams
from sentence_transformers import SentenceTransformer

# --- 1. CONFIGURATION ---
# Every connection setting can be overridden with an environment variable so
# that real credentials never have to be written into (or committed with) this
# file. The literals below are only the fallbacks used when a variable is unset.

MYSQL_CONFIG = {
    'user': os.environ.get('MYSQL_USER', 'root'),
    'password': os.environ.get('MYSQL_PASSWORD', 'our-mysql-password'),
    'host': os.environ.get('MYSQL_HOST', '35.232.41.48'),
    # Environment values are strings; the connector expects an integer port.
    'port': int(os.environ.get('MYSQL_PORT', '3306')),
    'database': os.environ.get('MYSQL_DATABASE', 'waste_less'),
}

# --- Qdrant Configuration ---
# Defaults to 'localhost' because the script is meant to talk to a Qdrant
# instance running on the same machine.
QDRANT_HOST = os.environ.get('QDRANT_HOST', 'localhost')

# Port passed to QdrantClient (Qdrant's HTTP port).
QDRANT_PORT = int(os.environ.get('QDRANT_PORT', '6333'))
QDRANT_COLLECTION = 'forum_posts'

# --- Model Configuration ---
MODEL_NAME = 'all-MiniLM-L6-v2'
# Vector size used when the Qdrant collection is created; it has to match the
# length of the embeddings produced by MODEL_NAME.
VECTOR_DIMENSION = 384


# --- 2. SCRIPT ---

def connect_to_services():
    """Set up everything the backfill needs to talk to the outside world.

    Builds a Qdrant client from QDRANT_HOST/QDRANT_PORT, loads the
    SentenceTransformer named by MODEL_NAME, and opens a MySQL connection
    using MYSQL_CONFIG, in that order.

    Returns:
        A ``(qdrant_client, model, mysql_connection)`` tuple.

    If any of the three steps raises, the error is printed and the process
    exits with status 1.
    """
    try:
        print(f"Connecting to Qdrant at {QDRANT_HOST}:{QDRANT_PORT}...")
        vector_store = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)

        print(f"Loading embedding model '{MODEL_NAME}'... (This may take a moment)")
        encoder = SentenceTransformer(MODEL_NAME)

        print(f"Connecting to MySQL at {MYSQL_CONFIG['host']}...")
        db_connection = mysql.connector.connect(**MYSQL_CONFIG)
    except Exception as err:
        # Nothing useful can be done without all three services, so stop here.
        print(f"Error during initialization: {err}")
        print("Please check all your connection details and ensure servers are running.")
        sys.exit(1)

    return vector_store, encoder, db_connection


def ensure_qdrant_collection(client):
    """Create the QDRANT_COLLECTION collection in Qdrant if it is missing.

    Args:
        client: A connected ``QdrantClient``.

    The collection is created with vectors of size VECTOR_DIMENSION compared
    by cosine distance. An existing collection is left untouched.

    If checking for or creating the collection raises, the error is printed
    and the process exits with status 1.
    """
    try:
        if client.collection_exists(collection_name=QDRANT_COLLECTION):
            print(f"Collection '{QDRANT_COLLECTION}' already exists.")
            return

        print(f"Collection '{QDRANT_COLLECTION}' not found. Creating it...")
        # Use create_collection rather than the deprecated recreate_collection:
        # the latter deletes any collection with the same name first, so a
        # collection that appeared after the existence check would be wiped.
        client.create_collection(
            collection_name=QDRANT_COLLECTION,
            vectors_config=VectorParams(size=VECTOR_DIMENSION, distance=Distance.COSINE),
        )
        print("Collection created.")

    except Exception as e:
        print(f"Error checking/creating Qdrant collection: {e}")
        sys.exit(1)


def fetch_posts_from_mysql(conn):
    """Read the ID and content of every row in the ``posts`` table.

    Args:
        conn: An open MySQL connection whose ``cursor()`` can be used as a
            context manager.

    Returns:
        Whatever ``cursor.fetchall()`` yields for the query, i.e. one
        ``(post_id, content)`` row per post.

    If the query fails for any reason, the error is printed and the process
    exits with status 1.
    """
    query = "SELECT post_id, content FROM posts"
    try:
        print("Fetching posts from MySQL...")
        with conn.cursor() as cur:
            cur.execute(query)
            rows = cur.fetchall()
        total = len(rows)
        print(f"Found {total} posts in MySQL.")
    except Exception as err:
        print(f"Error fetching from MySQL: {err}")
        sys.exit(1)

    return rows


def _fetch_existing_ids(client):
    """Return the set of point IDs currently stored in QDRANT_COLLECTION."""
    existing_ids = set()
    offset = None  # None makes scroll start from the beginning
    while True:
        points, offset = client.scroll(
            collection_name=QDRANT_COLLECTION,
            limit=1000,  # Process 1000 at a time
            offset=offset,
            with_payload=False,  # only the IDs are needed
            with_vectors=False,
        )
        existing_ids.update(point.id for point in points)
        # Test against None explicitly so that a falsy but valid offset
        # cannot end the scan early.
        if offset is None or not points:
            break
    return existing_ids


def _upload_batch(client, points, label="batch"):
    """Upsert ``points`` into QDRANT_COLLECTION; return True on success."""
    try:
        client.upsert(
            collection_name=QDRANT_COLLECTION,
            points=points,
            wait=True,
        )
    except Exception as e:
        print(f"Error uploading {label} of {len(points)} vectors: {e}")
        return False
    print(f"Uploaded {label} of {len(points)} vectors.")
    return True


def backfill_embeddings(batch_size=10):
    """Embed every MySQL post that is not yet in Qdrant and upload it.

    Steps: connect to the services, make sure the collection exists, read all
    posts, collect the IDs already present in Qdrant, then embed and upsert
    the missing posts in batches. Posts with empty or whitespace-only content
    are skipped. A summary is printed at the end.

    Args:
        batch_size: Number of vectors sent to Qdrant per upsert call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    qdrant_client, model, mysql_conn = connect_to_services()
    try:
        ensure_qdrant_collection(qdrant_client)
        posts = fetch_posts_from_mysql(mysql_conn)
    finally:
        # All rows are already in memory, so release the database connection
        # now instead of holding it open for the whole embedding run (and
        # make sure it is closed even if one of the steps above exits).
        mysql_conn.close()

    print("\n--- Starting Embedding Backfill ---")
    start_time = time.time()

    # Fetch all existing IDs from Qdrant up front so each post is a set lookup.
    try:
        print("Fetching existing IDs from Qdrant to prevent duplicates...")
        existing_ids = _fetch_existing_ids(qdrant_client)
        print(f"Found {len(existing_ids)} existing vectors in Qdrant.")
    except Exception as e:
        # Without the ID list nothing can be skipped; upsert overwrites points
        # with the same ID, so re-embedding everything is still safe.
        print(f"Warning: Could not fetch existing IDs from Qdrant ({e}). "
              "All posts will be re-embedded and upserted.")
        existing_ids = set()

    pending = []
    uploaded_count = 0
    failed_count = 0

    for post_id, content in posts:
        if post_id in existing_ids:
            continue

        if not content or content.isspace():
            print(f"Skipping post {post_id}: Content is empty.")
            continue

        print(f"Embedding post {post_id}...")
        try:
            vector = model.encode(content).tolist()
            # A payload (e.g. a content snippet) could be attached here if needed.
            point = PointStruct(id=post_id, vector=vector)
        except Exception as e:
            print(f"Error embedding post {post_id}: {e}")
            failed_count += 1
            continue

        pending.append(point)
        if len(pending) >= batch_size:
            # Count vectors only once Qdrant has accepted them, and always
            # start a fresh batch so a failed upload is not re-sent forever.
            if _upload_batch(qdrant_client, pending):
                uploaded_count += len(pending)
            else:
                failed_count += len(pending)
            pending = []

    # Upload any remaining points
    if pending:
        if _upload_batch(qdrant_client, pending, label="final batch"):
            uploaded_count += len(pending)
        else:
            failed_count += len(pending)

    end_time = time.time()
    print("\n--- Backfill Finished ---")
    print(f"Added {uploaded_count} new vectors.")
    if failed_count:
        print(f"Failed to add {failed_count} posts; re-run the script to retry them.")
    print(f"Total time: {end_time - start_time:.2f} seconds.")


# Run the backfill only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    backfill_embeddings()