View Notebook on Github

Multi-Tool Orchestration with a RAG Approach Using OpenAI

This cookbook guides you through building dynamic, multi-tool workflows using OpenAI’s APIs. It demonstrates how to implement a Retrieval-Augmented Generation (RAG) approach that intelligently routes user queries to the appropriate built-in or external tools. Whether a query calls for general knowledge or requires specific internal context from a vector database (like Pinecone), this guide shows you how to integrate function calls, simulate web searches, and leverage document retrieval to generate accurate, context-aware responses.

Installation

Install required dependencies:

pip install agentops openai datasets tqdm pandas pinecone-client python-dotenv

Setup

import os
from dotenv import load_dotenv
import time
from tqdm.auto import tqdm # tqdm.auto for notebook compatibility
from pandas import DataFrame
from datasets import load_dataset
import random
import string
from openai import OpenAI
import agentops
from pinecone import Pinecone # Pinecone class for client
from pinecone import ServerlessSpec
import json # For handling tool arguments

Set API Keys:

load_dotenv()

os.environ["AGENTOPS_API_KEY"] = os.getenv("AGENTOPS_API_KEY", "your_agentops_api_key_here")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY", "your_openai_api_key_here")
os.environ["PINECONE_API_KEY"] = os.getenv("PINECONE_API_KEY", "your_pinecone_api_key_here")

Initialize AgentOps and OpenAI client:

agentops.init(auto_start_session=True) 
tracer = agentops.start_trace(trace_name="Multi-Tool Orchestration with RAG", tags=["multi-tool-orchestration-rag-demo", "openai-responses", "agentops-example"])
client = OpenAI()

In this example, we use a sample medical reasoning dataset from Hugging Face. We convert the dataset into a Pandas DataFrame and merge the “Question” and “Response” columns into a single string. The merged text is used for embedding, while the original Question and Response are later stored as metadata.

# Load the dataset (ensure you're logged in with huggingface-cli if needed)
ds = load_dataset("FreedomIntelligence/medical-o1-reasoning-SFT", "en", split="train[:100]", trust_remote_code=True)
ds_dataframe = DataFrame(ds)

# Merge the Question and Response columns into a single string.
ds_dataframe["merged"] = ds_dataframe.apply(
    lambda row: f"Question: {row['Question']} Answer: {row['Response']}", axis=1
)
print("Example merged text:", ds_dataframe["merged"].iloc[0])
# ds_dataframe  # Uncomment to display the full DataFrame in a notebook

Create a Pinecone Index Based on the Dataset

Use the dataset itself to determine the embedding dimensionality. For example, compute one embedding from the merged column and then create the index accordingly.

MODEL = "text-embedding-3-small"  # Replace with your production embedding model if needed
# Compute an embedding for the first document to obtain the embedding dimension.
sample_embedding_resp = client.embeddings.create(input=[ds_dataframe["merged"].iloc[0]], model=MODEL)
embed_dim = len(sample_embedding_resp.data[0].embedding)
print(f"Embedding dimension: {embed_dim}")
# Initialize Pinecone using your API key.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Define the Pinecone serverless specification.
AWS_REGION = "us-east-1" # As in notebook
spec = ServerlessSpec(cloud="aws", region=AWS_REGION)

# Create a random index name with lower case alphanumeric characters and '-'
index_name = "pinecone-index-" + "".join(random.choices(string.ascii_lowercase + string.digits, k=10))

# Create the index if it doesn't already exist.
if index_name not in pc.list_indexes().names():
    pc.create_index(index_name, dimension=embed_dim, metric="dotproduct", spec=spec)
    # Wait for index to be ready (important)
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to the index.
index = pc.Index(index_name)
print("Index stats:", index.describe_index_stats())

Upsert the Dataset into the Pinecone Index

Process the dataset in batches, generate embeddings for each merged text, prepare metadata (including separate Question and Answer fields), and upsert each batch into the index.

batch_size = 32
for i in tqdm(range(0, len(ds_dataframe["merged"]), batch_size), desc="Upserting to Pinecone"):
    i_end = min(i + batch_size, len(ds_dataframe["merged"]))
    lines_batch = ds_dataframe["merged"][i:i_end].tolist() # Ensure it's a list of strings
    ids_batch = [str(n) for n in range(i, i_end)]

    # Create embeddings for the current batch.
    res = client.embeddings.create(input=lines_batch, model=MODEL) # Pass list directly
    embeds = [record.embedding for record in res.data]

    # Prepare metadata by extracting the original Question and Answer for each record.
    meta = [
        {"Question": rec["Question"], "Answer": rec["Response"]}
        for rec in ds_dataframe.iloc[i:i_end].to_dict("records")
    ]

    # Upsert the batch into Pinecone.
    vectors = list(zip(ids_batch, embeds, meta))
    index.upsert(vectors=vectors)

Query the Pinecone Index

Create a natural language query, compute its embedding, and perform a similarity search on the Pinecone index. The returned results include metadata that provides context for generating answers.

def query_pinecone_index(client, index, model, query_text):
    # Generate an embedding for the query (the input must be a list of strings).
    query_embedding = client.embeddings.create(input=[query_text], model=model).data[0].embedding

    # Query the index and return the top 5 matches with their metadata.
    res = index.query(vector=query_embedding, top_k=5, include_metadata=True)
    print("Query Results:")
    for match in res["matches"]:
        print(
            f"{match['score']:.2f}: {match['metadata'].get('Question', 'N/A')} - {match['metadata'].get('Answer', 'N/A')}"
        )
    return res
# Example usage with a different query from the train/test set
query = (
    "A 45-year-old man with a history of alcohol use presents with symptoms including confusion, ataxia, and ophthalmoplegia. "
    "What is the most likely diagnosis and the recommended treatment?"
)
query_pinecone_index(client, index, MODEL, query)

Generate a Response Using the Retrieved Context

Select the best matching result from your query results and use the OpenAI Chat Completions API to generate a final answer by combining the retrieved context with the original question.

# Retrieve and concatenate the top 3 match contexts.
query_embedding_for_rag = client.embeddings.create(input=[query], model=MODEL).data[0].embedding
matches = index.query(
    vector=query_embedding_for_rag, top_k=3, include_metadata=True
)["matches"]

context = "\n\n".join(
    f"Question: {m['metadata'].get('Question', '')}\nAnswer: {m['metadata'].get('Answer', '')}" for m in matches
)

# Use the context to generate a final answer using Chat Completions.
response_payload = {
    "model": "gpt-4o", 
    "messages": [
        {"role": "system", "content": "You are a helpful medical assistant. Answer based on the provided context."},
        {"role": "user", "content": f"Provide the answer based on the context: {context} and the question: {query} as per the internal knowledge base"}
    ]
}
completion = client.chat.completions.create(**response_payload)
final_answer_text = completion.choices[0].message.content
print("\nFinal Answer (using Chat Completions):")
print(final_answer_text)

Orchestrate Multi-Tool Calls

Now, we’ll define tools. One simulates web search, and the other queries our Pinecone vector store.

# Define available tools.
tools = [
    {
        "type": "function", # Standard type for custom functions
        "function": {
            "name": "web_search_preview_tool", # Giving it a function-like name
            "description": "Performs a web search and returns a preview of results. Use for real-time or up-to-date information.",
            "parameters": { # Even simulated tools can define expected parameters
                "type": "object",
                "properties": {
                    "search_query": {"type": "string", "description": "The query to search the web for."}
                },
                "required": ["search_query"]
            }
        }
    },
    {
        "type": "function",
        "function": { 
            "name": "PineconeSearchDocuments",
            "description": "Search for relevant documents based on the medical question asked by the user that is stored within the vector database using a semantic query.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "The natural language query to search the vector database."},
                    "top_k": {"type": "integer", "description": "Number of top results to return.", "default": 3},
                },
                "required": ["query"],
            }
        }
    },
]
# Example queries that the model should route appropriately.
queries = [
    {"query": "Who won the cricket world cup in 1983?"},
    {"query": "What is the most common cause of death in the United States according to the internet?"},
    {
        "query": (
            "A 7-year-old boy with sickle cell disease is experiencing knee and hip pain, "
            "has been admitted for pain crises in the past, and now walks with a limp. "
            "His exam shows a normal, cool hip with decreased range of motion and pain with ambulation. "
            "What is the most appropriate next step in management according to the internal knowledge base?"
        )
    },
]
# Process each query dynamically using Chat Completions for tool routing.
for item in queries:
    current_messages = [{"role": "user", "content": item["query"]}]
    print("\n🌟--- Processing Query ---🌟")
    print(f"🔍 **User Query:** {item['query']}")

    # First call to LLM to decide on tool usage
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "When prompted with a question, select the right tool to use based on the question. If a web search is needed, use 'web_search_preview_tool'. If internal knowledge base search is needed, use 'PineconeSearchDocuments'.",
            },
            {"role": "user", "content": item["query"]},
        ],
        tools=tools,
        tool_choice="auto", 
    )

    response_message = response.choices[0].message
    tool_calls = response_message.tool_calls
    
    current_messages.append(response_message) # Add assistant's decision to message history

    print("\n✨ **Initial LLM Response:**")
    if response_message.content:
        print(response_message.content)
    
    if tool_calls:
        print(f"🔧 **Model triggered tool call(s):**")
        
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            function_args = json.loads(tool_call.function.arguments)
            
            print(f"  - Tool: {function_name}, Args: {function_args}")
            function_response_content = ""

            if function_name == "PineconeSearchDocuments":
                print("🔍 **Invoking PineconeSearchDocuments tool...**")
                pinecone_query = function_args.get("query")
                top_k_results = function_args.get("top_k", 3)
                # query_pinecone_index returns the full response object
                pinecone_response_obj = query_pinecone_index(client, index, MODEL, pinecone_query) 
                
                matches_str_list = []
                if pinecone_response_obj["matches"]:
                    for match in pinecone_response_obj["matches"][:top_k_results]:
                        matches_str_list.append(f"Score: {match['score']:.2f}, Q: {match['metadata'].get('Question', 'N/A')}, A: {match['metadata'].get('Answer', 'N/A')}")
                function_response_content = "\n".join(matches_str_list) if matches_str_list else "No matching documents found."
                print("✅ **PineconeSearchDocuments tool invoked successfully.**")
                
            elif function_name == "web_search_preview_tool": 
                print("🔍 **Simulating web_search_preview_tool...**")
                search_query_arg = function_args.get("search_query", item['query'])
                function_response_content = f"Simulated web search result for: {search_query_arg}"
                print("✅ **Simulated web_search_preview_tool invoked successfully.**")
            else:
                 print(f"⚠️ Unknown tool: {function_name}")
                 function_response_content = "Error: Unknown tool."

            current_messages.append(
                {
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": function_response_content,
                }
            )
        
        # Second call to LLM with tool results
        print("\n🔄 **Getting final response from LLM...**")
        final_response = client.chat.completions.create(
            model="gpt-4o", 
            messages=current_messages
        )
        print("💡 **Final Answer:**")
        print(final_response.choices[0].message.content)
    elif response_message.content:
        # If no tool call is triggered, print the response directly.
        print("💡 **Final Answer (no tool call):**")
        print(response_message.content)
    else:
        print("⚠️ No content or tool call in response.")

As shown above, the appropriate tool is invoked for each query in order to produce the best response.

For instance, in the third example, the model triggers the tool named “PineconeSearchDocuments”; the code then calls query_pinecone_index with the current query and extracts the best matches as context. For non-health-related inquiries, or queries that explicitly ask for an internet search, the code calls the (simulated) web_search_preview_tool function. For other queries, the model may choose not to call any tool at all and answer directly from its own knowledge.

Finally, each tool call and its output are appended to the conversation, and the final answer is generated by a second call to the Chat Completions API.
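The routing loop above can be packaged into a small reusable helper. Below is a minimal sketch of that pattern; the function name answer_with_tools and its dispatch logic are illustrative rather than part of the original notebook, and the sketch assumes the client, tools, index, MODEL, and query_pinecone_index objects defined earlier in this guide.

def answer_with_tools(user_query, model="gpt-4o"):
    """Route a query through tool calls (if any) and return the final answer."""
    messages = [
        {"role": "system", "content": "Select the right tool for the question, or answer directly."},
        {"role": "user", "content": user_query},
    ]
    # First pass: let the model decide whether (and which) tools to call.
    first = client.chat.completions.create(
        model=model, messages=messages, tools=tools, tool_choice="auto"
    )
    assistant_msg = first.choices[0].message
    if not assistant_msg.tool_calls:
        return assistant_msg.content  # No tool needed; answer directly.

    messages.append(assistant_msg)
    for tool_call in assistant_msg.tool_calls:
        args = json.loads(tool_call.function.arguments)
        if tool_call.function.name == "PineconeSearchDocuments":
            res = query_pinecone_index(client, index, MODEL, args["query"])
            result = "\n".join(
                f"Q: {m['metadata'].get('Question', '')} A: {m['metadata'].get('Answer', '')}"
                for m in res["matches"][: args.get("top_k", 3)]
            ) or "No matching documents found."
        else:  # web_search_preview_tool (still simulated, as above)
            result = f"Simulated web search result for: {args.get('search_query', user_query)}"
        messages.append(
            {"tool_call_id": tool_call.id, "role": "tool",
             "name": tool_call.function.name, "content": result}
        )

    # Second pass: generate the final answer grounded in the tool outputs.
    final = client.chat.completions.create(model=model, messages=messages)
    return final.choices[0].message.content

# Example: route the sickle-cell question from the list above through the helper.
print(answer_with_tools(queries[2]["query"]))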

Multi-tool orchestration flow (Sequential Example)

Now let's modify the input query and the system instructions to the Chat Completions API so that the model follows a tool-calling sequence before generating the output.

# Process one query as an example to understand the tool calls and function calls as part of the response output
item_query_seq = "What is the most common cause of death in the United States"

# Initialize input messages with the user's query.
current_messages_seq = [{"role": "user", "content": item_query_seq}]
print("\n🌟--- Processing Query (Sequential Tool Call Example) ---🌟")
print(f"🔍 **User Query:** {item_query_seq}")

# Call the Chat Completions API with tools enabled
print("\n🔧 **Calling Chat Completions API with Tools Enabled**")
print("\n🕵️‍♂️ **Step 1: Web Search Call (Implicit)**")
print("   - Initiating web search to gather initial information.")
print("\n📚 **Step 2: Pinecone Search Call (Implicit)**")
print("   - Querying Pinecone to find relevant examples from the internal knowledge base.")

# First LLM call to determine initial tool usage
response_seq_1 = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {
            "role": "system",
            "content": "First, call the web_search_preview_tool for results. Then, using that information, call `PineconeSearchDocuments` to find real examples in the internal knowledge base. Finally, synthesize an answer.",
        },
        {"role": "user", "content": item_query_seq},
    ],
    tools=tools,
    tool_choice="auto", # Let model decide, or force a specific tool if needed for strict sequence
)

print("\n✨ **Initial Response Output (after first LLM call):**")
response_message_seq_1 = response_seq_1.choices[0].message
tool_calls_seq_1 = response_message_seq_1.tool_calls

if response_message_seq_1.content:
    print(f"Assistant Content: {response_message_seq_1.content}")
current_messages_seq.append(response_message_seq_1) 

if tool_calls_seq_1:
    print("🔧 **Tool calls from first LLM response:**")
    for tc in tool_calls_seq_1:
        print(f"  - ID: {tc.id}, Function: {tc.function.name}, Args: {tc.function.arguments}")
        tool_function_args = json.loads(tc.function.arguments)
        tool_response_content = ""
        
        if tc.function.name == "web_search_preview_tool":
            simulated_web_search_output = f"Simulated web search result for query: {tool_function_args.get('search_query', item_query_seq)}"
            tool_response_content = simulated_web_search_output
            print(f"  ✔️ Simulated output for {tc.function.name}: {tool_response_content}")
        elif tc.function.name == "PineconeSearchDocuments":
            pinecone_query_text = tool_function_args.get("query", item_query_seq) 
            pinecone_res_obj = query_pinecone_index(client, index, MODEL, pinecone_query_text)
            matches_str_list_seq = []
            if pinecone_res_obj["matches"]:
                for match in pinecone_res_obj["matches"][:tool_function_args.get("top_k", 3)]:
                    matches_str_list_seq.append(f"Score: {match['score']:.2f}, Q: {match['metadata'].get('Question', 'N/A')}, A: {match['metadata'].get('Answer', 'N/A')}")
            tool_response_content = "\n".join(matches_str_list_seq) if matches_str_list_seq else "No matching documents found in Pinecone."
            print(f"  ✔️ Output for {tc.function.name}: {tool_response_content[:100]}...")
        
        current_messages_seq.append({
            "tool_call_id": tc.id,
            "role": "tool",
            "name": tc.function.name,
            "content": tool_response_content,
        })

    # Second call to LLM with tool results
    print("\n🔄 **Calling Chat Completions API again with tool results...**")
    response_seq_2 = client.chat.completions.create(
        model="gpt-4o",
        messages=current_messages_seq,
        tools=tools # Make tools available again if model decides to call another one
    )
    final_answer_seq = response_seq_2.choices[0].message.content
    print("\n💡 **Final Answer (after sequential tool calls):**")
    print(final_answer_seq)

else:
    print("⚠️ No tool calls in the first response. Final answer is the initial content.")
    if response_message_seq_1.content:
        print(response_message_seq_1.content)

agentops.end_trace(tracer, end_state="Success") # End the main trace
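Because the index name was randomly generated for this demo, you may want to remove the index once you are finished so it does not linger in your Pinecone project. A minimal cleanup sketch (only run this if you no longer need the data; deletion is irreversible):

# Delete the demo index created earlier in this guide.
pc.delete_index(index_name)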

In this cookbook, we have seen how to use OpenAI’s Chat Completions API to implement a Retrieval-Augmented Generation (RAG) approach with multi-tool calling. The model selects the appropriate tool based on the input query: general questions may be handled by a web search tool (simulated here), while specific medical inquiries about internal knowledge are answered by retrieving context from a vector database (such as Pinecone) via function calls. We also showed how multiple tool calls can be chained to generate a final response based on our instructions. Happy coding!