gather milvus collection creations into initialization function

5 months ago · 511f05af83
parent cccc1f0877
commit 511f05af83
1 changed files with 401 additions and 285 deletions
--- a/database/milvus.ts
+++ b/database/milvus.ts
@ -9,15 +9,27 @@ const ssl = true; // secure or not
 // connect to milvus
 export const client = new MilvusClient({ address, ssl, username, password });

+async function initialize() {
+  /** List existing collections to determine which ones we have to create. */
+  const collections = await client.listCollections();
+  if (collections.status.code !== 0) {
+    console.log(collections.status.error_code);
+    console.log(collections.status.reason);
+    return;
+  }
+  const collectionNames = collections.data.map((c) => c.name);
+
+  /** Create collections if they don't exist. */
+  if (!collectionNames.includes("facts")) {
+    console.log("Creating collection: facts");
    await client.createCollection({
      collection_name: "facts",
+      auto_id: true,
      fields: [
        {
          name: "id",
          data_type: DataType.Int64,
          is_primary_key: true,
-      //@ts-ignore
-      type_params: { auto_id: true },
        },
        {
          name: "user_id",
@ -35,12 +47,15 @@ await client.createCollection({
          name: "fact_content",
          data_type: DataType.VarChar,
          description: `The actual natural language statement of the fact (e.g., "The user likes pizza."). This is what you'll inject into the LLM context.`,
+          type_params: {
+            max_length: "10240",
+          },
        },
        {
          name: "fact_embedding",
          data_type: DataType.FloatVector,
          type_params: {
-        dim: 3072,
+            dim: "3072",
          },
          description: `Vector for the fact_content itself. This allows you to retrieve facts based on semantic similarity to a user's new statement (e.g., user asks "What do I like?", you embed that and search fact_embedding).`,
        },
@ -48,6 +63,7 @@ await client.createCollection({
        { name: "created_at", data_type: DataType.Int64 },
      ],
    });
+  }

  /** Stores the semantic contexts (trigger_embeddings) for when a specific Fact
   * should be injected. This allows for a many-to-one relationship (many triggers
@ -58,10 +74,17 @@ await client.createCollection({
   * situations in which the Facts should be invoked, each with a different
   * priority_multiplier.
   */
+  if (!collectionNames.includes("fact_triggers")) {
+    console.log("Creating collection: fact_triggers");
    await client.createCollection({
      collection_name: "fact_triggers",
+      auto_id: true,
      fields: [
-    { name: "id", data_type: DataType.Int64, is_primary_key: true },
+        {
+          name: "id",
+          data_type: DataType.Int64,
+          is_primary_key: true,
+        },
        {
          name: "fact_id",
          data_type: DataType.Int64,
@ -72,12 +95,15 @@ await client.createCollection({
          name: "trigger_phrase",
          data_type: DataType.VarChar,
          description: `The natural language phrase describing the trigger (e.g., "When food preferences are discussed"). Useful for context and debugging during trigger generation.`,
+          type_params: {
+            max_length: "10240",
+          },
        },
        {
          name: "trigger_phrase_embedding",
          data_type: DataType.FloatVector,
          type_params: {
-        dim: 3072,
+            dim: "3072",
          },
          description: `The vector representation of the trigger phrase. This is what you'll query with the current conversation context, with a large/smart LLM.`,
        },
@ -92,6 +118,9 @@ await client.createCollection({
          data_type: DataType.VarChar,
          description:
            "The natural-language reason for why this FactTrigger was given its priority multiplier. We store the reason here so that we can easily explain it to the user; but more importantly, the same FactTrigger might have different priorities in different situations that are only discovered from subsequent user input. In this case, we don't want to update the existing FactTrigger's priority multiplier, but instead create a new FactTrigger with a new priority multiplier, with different trigger_phrase and different scope_priority_multiplier_reason.",
+          type_params: {
+            max_length: "10240",
+          },
        },
        {
          name: "scope_conversation_id",
@ -103,13 +132,21 @@ await client.createCollection({
        { name: "created_at", data_type: DataType.Int64 },
      ],
    });
+  }

  /** Stores metadata about the conversations themselves. Right now, it's only the
   * title and creation time. */
+  if (!collectionNames.includes("conversations")) {
+    console.log("Creating collection: conversations");
    await client.createCollection({
      collection_name: "conversations",
+      auto_id: true,
      fields: [
-    { name: "id", data_type: DataType.Int64, is_primary_key: true },
+        {
+          name: "id",
+          data_type: DataType.Int64,
+          is_primary_key: true,
+        },
        {
          name: "user_id",
          data_type: DataType.Int64,
@ -121,6 +158,16 @@ await client.createCollection({
          data_type: DataType.VarChar,
          description:
            "The title of the conversation, for friendly display in the UI.",
+          type_params: {
+            max_length: "1024",
+          },
+        },
+        {
+          name: "vector",
+          data_type: DataType.FloatVector,
+          type_params: { dim: "2" },
+          description:
+            "A bogus vector field so Milvus lets us create the collection (every collection needs at least one vector field).",
        },
        {
          name: "created_at",
@ -129,15 +176,23 @@ await client.createCollection({
        },
      ],
    });
+  }

  /** To store the full history of your conversations, enabling summarization, context reconstruction, and traceability.
   *
   * Purpose: To persist the raw chat history for each user.
   */
+  if (!collectionNames.includes("conversation_messages")) {
+    console.log("Creating collection: conversation_messages");
    await client.createCollection({
      collection_name: "conversation_messages",
+      auto_id: true,
      fields: [
-    { name: "id", data_type: DataType.Int64, is_primary_key: true },
+        {
+          name: "id",
+          data_type: DataType.Int64,
+          is_primary_key: true,
+        },
        {
          name: "user_id",
          data_type: DataType.Int64,
@ -160,18 +215,24 @@ await client.createCollection({
          name: "role",
          data_type: DataType.VarChar,
          description: `"user" or "assistant". This is what you'll inject into the LLM context.`,
+          type_params: {
+            max_length: "64",
+          },
        },
        {
          name: "message_content",
          data_type: DataType.VarChar,
          description:
            "The raw text of the message. This is what you'll inject into the LLM context.",
+          type_params: {
+            max_length: "65535", // 64KB
+          },
        },
        {
          name: "message_embedding",
          data_type: DataType.FloatVector,
          type_params: {
-        dim: 3072,
+            dim: "3072",
          },
          description:
            "Embedding of the message content. Can be used for semantic search within chat history or for generating conversation summary embeddings.",
@ -181,73 +242,39 @@ await client.createCollection({
          data_type: DataType.VarChar,
          description:
            "The raw text of a running summary of the conversation until this message.",
+          type_params: {
+            max_length: "65535", // 64KB
+          },
        },
        {
          name: "running_summary_embedding",
          data_type: DataType.FloatVector,
          type_params: {
-        dim: 3072,
+            dim: "3072",
          },
          description:
            "(Optional but useful) Embedding of the running summary. Can be used for semantic search within chat history.",
        },
-    {
-      name: "timestamp",
-      data_type: DataType.Int64,
-      description:
-        "Unix timestamp when the message was sent/received. This is what you'll inject into the LLM context.",
-    },
-  ],
-});
-
-/**
- * To store basic user information. Even if you only have one user initially,
- * it's good practice to structure for multi-tenancy.
- *
- * Purpose: To store basic information about each registered user.
- */
-await client.createCollection({
-  collection_name: "users",
-  fields: [
-    { name: "id", data_type: DataType.Int64, is_primary_key: true },
-    {
-      name: "username",
-      data_type: DataType.VarChar,
-      description: "User's chosen username.",
-    },
-    {
-      name: "password",
-      data_type: DataType.VarChar,
-      description: "(Optional) User's password.",
-    },
-    {
-      name: "email",
-      data_type: DataType.VarChar,
-      description: "(Optional) User's email address.",
-    },
        {
          name: "created_at",
          data_type: DataType.Int64,
-      description: "Unix timestamp of user creation.",
-    },
-    {
-      name: "last_login",
-      data_type: DataType.Int64,
-      description: "Unix timestamp of last user login.",
+          description: "Unix timestamp when the message was sent/received.",
        },
      ],
    });
+  }

+  if (!collectionNames.includes("tools")) {
+    console.log("Creating collection: tools");
    await client.createCollection({
      collection_name: "tools",
+      auto_id: true,
      fields: [
        /** Primary key, unique identifier for each tool. */
        {
          name: "id",
          data_type: DataType.Int64,
          is_primary_key: true,
-      //@ts-ignore
-      type_params: { auto_id: true },
        },
        /** Foreign key linking to the Users Collection.
         * Crucial if you have multiple users. */
@ -270,11 +297,17 @@ await client.createCollection({
          name: "name",
          data_type: DataType.VarChar,
          description: `The name of the tool (e.g., "weather").`,
+          type_params: {
+            max_length: "64",
+          },
        },
        {
          name: "description",
          data_type: DataType.VarChar,
          description: `The description of the tool (e.g., "Get the current weather in a given location.").`,
+          type_params: {
+            max_length: "10240",
+          },
        },
        {
          name: "parameter_schema",
@ -285,20 +318,103 @@ await client.createCollection({
          name: "implementation_language",
          data_type: DataType.VarChar,
          description: `The language of the tool (e.g., "Python"). This is so we know how to execute the tool's implementation code.`,
+          type_params: {
+            max_length: "128",
+          },
        },
-
        {
          name: "implementation_code",
          data_type: DataType.VarChar,
          description: `The actual code that implements the tool (e.g., "def get_weather(location): return 'Sunny'").`,
+          type_params: {
+            max_length: "65535", // 64KB
+          },
+        },
+        {
+          name: "implementation_embedding",
+          data_type: DataType.FloatVector,
+          type_params: {
+            dim: "3072",
+          },
+          description: `The embedding of the tool's implementation code.`,
        },
        /** Unix timestamp of when the tool was created. */
        { name: "created_at", data_type: DataType.Int64 },
      ],
    });
+  }

  /** I'm still not sure if it's useful to have a `tool_trigger` collection. */

  /** TODO: How do I store Agents? Should I store Agents, since the user is
   * supposed to interact with only one "entity"? Is it even useful, since each
   * LLM call will contain what it needs? */
+
+  /** TODO: Not creating `users` collection yet, because the Zilliz Free tier
+   * maxes out at 5 collections. */
+  // /**
+  //  * To store basic user information. Even if you only have one user initially,
+  //  * it's good practice to structure for multi-tenancy.
+  //  *
+  //  * Purpose: To store basic information about each registered user.
+  //  */
+  // if (!collectionNames.includes("users")) {
+  //   console.log("Creating collection: users");
+  //   await client.createCollection({
+  //     collection_name: "users",
+  //     auto_id: true,
+  //     fields: [
+  //       {
+  //         name: "id",
+  //         data_type: DataType.Int64,
+  //         is_primary_key: true,
+  //       },
+  //       {
+  //         name: "username",
+  //         data_type: DataType.VarChar,
+  //         description: "User's chosen username.",
+  //         type_params: {
+  //           max_length: "64",
+  //         },
+  //       },
+  //       {
+  //         name: "password",
+  //         data_type: DataType.VarChar,
+  //         description: "User's password.",
+  //         type_params: {
+  //           max_length: "256",
+  //         },
+  //       },
+  //       {
+  //         name: "email",
+  //         data_type: DataType.VarChar,
+  //         description: "User's email address.",
+  //         type_params: {
+  //           max_length: "128",
+  //         },
+  //       },
+  //       {
+  //         name: "vector",
+  //         data_type: DataType.FloatVector,
+  //         type_params: {
+  //           dim: "2",
+  //         },
+  //         description:
+  //           "A bogus vector field so Milvus lets us create the collection (every collection needs at least one vector field).",
+  //       },
+  //       {
+  //         name: "last_login",
+  //         data_type: DataType.Int64,
+  //         description: "Unix timestamp of last user login.",
+  //       },
+  //       {
+  //         name: "created_at",
+  //         data_type: DataType.Int64,
+  //         description: "Unix timestamp of user creation.",
+  //       },
+  //     ],
+  //   });
+  // }
+}
+
+await initialize();