Tutorials · June 21, 2025 · 9 min read

Using LLM APIs for Data Analysis in Python: A Practical Guide (2025)

Use free LLM APIs to analyze datasets, generate insights, write pandas code, explain results, and build natural language query interfaces for your data.

LLMs as Data Analysis Assistants

The most valuable thing an LLM can do for data analysis is not to do the analysis itself, but to help you write better code faster, explain results clearly, identify patterns you might miss, and build conversational interfaces on top of your data. This guide shows practical patterns for all of these, using free API keys from FreeLLMKeys.

Setup

pip install openai pandas matplotlib seaborn

import json
import os

import pandas as pd
from openai import OpenAI

# Shared client used by every helper in this guide. The OpenAI SDK is pointed
# at a custom endpoint through ``base_url``. The key is read from the
# FREELLMKEYS_API_KEY environment variable so a real key never has to be
# committed to source; the placeholder is used only when the variable is unset.
client = OpenAI(
    base_url="https://aiapiv2.pekpik.com/v1",
    api_key=os.environ.get("FREELLMKEYS_API_KEY", "sk-your-freellmkeys-key"),
)

Pattern 1 — Generate Analysis Code from Plain English

Describe what you want to do with your data and get back pandas code that you can review and run:

def _strip_code_fence(text: str) -> str:
    """Return ``text`` without a leading Markdown code fence, if it has one.

    Text that does not start with a fence is returned unchanged. Otherwise the
    opening fence line (including any language tag) is dropped and everything
    from the closing fence onward is discarded.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return text
    body = stripped.splitlines()[1:]  # drop the opening fence line
    for index, line in enumerate(body):
        if line.strip() == "```":
            return "\n".join(body[:index])
    # Unterminated fence (e.g. a truncated reply): keep whatever code arrived.
    return "\n".join(body)


def generate_analysis_code(df_description: str, task: str) -> str:
    """Ask the model for pandas code that performs ``task`` on the described data.

    Args:
        df_description: Plain-text description of the DataFrame; it is
            inserted into the prompt verbatim.
        task: What the generated code should accomplish, in plain English.

    Returns:
        The generated code as a string. If the reply starts with a Markdown
        code fence, the fence is removed so only the code remains. An empty
        string is returned when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are an expert data analyst. Write clean, correct pandas code. Return only the code, no explanation."
            },
            {
                "role": "user",
                "content": f"""Dataset description:
{df_description}

Task: {task}

Write pandas code to accomplish this. Assume the DataFrame is already loaded as 'df'."""
            }
        ]
    )
    # ``content`` can be None, so normalise to a string to honour the
    # ``-> str`` contract before cleaning up the reply.
    reply = response.choices[0].message.content or ""
    # Models frequently fence code even when told to return only code.
    return _strip_code_fence(reply)

# Example: describe the dataset schema in prose, then request code for one task.
df_desc = """
Sales dataset with columns:
- date (datetime): sale date
- product (str): product name
- category (str): product category
- revenue (float): sale amount in USD
- region (str): US region (North, South, East, West)
- units_sold (int): number of units
"""

task = "Find the top 5 products by total revenue for Q4 2024, grouped by category"
code = generate_analysis_code(df_desc, task)
# The result is model-generated pandas code; review it before running it on df.
print(code)

Pattern 2 — Explain Statistical Results in Plain English

def explain_results(analysis_output: str, context: str = "") -> str:
    """Turn raw analysis output into a plain-English summary for stakeholders.

    Args:
        analysis_output: Text form of the results to be explained.
        context: Optional background; when non-empty it is added to the
            prompt as a ``Context: ...`` line.

    Returns:
        The content of the first choice in the model's reply.
    """
    # An empty context leaves this line blank instead of adding a label.
    context_line = f"Context: {context}" if context else ""
    prompt = f"""Explain the following data analysis results in plain English.
Make it understandable for a non-technical business stakeholder.
Highlight the most important insights and any action items.

{context_line}

Results:
{analysis_output}"""

    # The guide recommends a Claude model for explanation-style output.
    completion = client.chat.completions.create(
        model="claude-opus-4-7",
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content

# Example: have the model explain a small correlation matrix.
df = pd.DataFrame(
    {
        "sales": [100, 150, 200, 130, 170],
        "ad_spend": [10, 20, 35, 15, 25],
        "price": [50, 50, 45, 50, 48],
        "temperature": [20, 25, 30, 22, 28],
    }
)

# Round to three decimals so the text sent to the model stays compact.
corr = df.corr()
corr = corr.round(3)
explanation = explain_results(str(corr), "Ice cream sales data for summer 2024")
print(explanation)

Pattern 3 — Natural Language Query Interface

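Let users ask questions about data in plain English and get answers back. Note that the model sees only a summary of the DataFrame (shape, columns, dtypes, the first three rows, and `describe()` statistics), so treat the ANSWER as a draft and run the returned CODE to confirm it: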

def nl_query(df: pd.DataFrame, question: str) -> str:
    """Answer a plain-English question about ``df`` from a summary of it.

    The model does not receive the full DataFrame: only its shape, column
    names, dtypes, first three rows and ``describe()`` statistics are sent.

    Args:
        df: DataFrame the question is about.
        question: The user's question in natural language.

    Returns:
        The content of the first choice in the model's reply. The system
        prompt asks for an ANSWER part followed by a CODE part.
    """
    system_prompt = """You are a data analyst. When given a question about a dataset,
respond in two parts:
1. ANSWER: A direct answer to the question
2. CODE: The pandas code that produces this answer (if applicable)"""

    # Compact structural summary that stands in for the data itself.
    shape = df.shape
    column_names = list(df.columns)
    dtypes_text = df.dtypes.to_string()
    sample_text = df.head(3).to_string()
    stats_text = df.describe().to_string()
    df_info = f"""
Shape: {shape}
Columns: {column_names}
Dtypes:
{dtypes_text}

Sample (first 3 rows):
{sample_text}

Statistics:
{stats_text}
"""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Dataset info:\n{df_info}\n\nQuestion: {question}"},
    ]
    completion = client.chat.completions.create(model="gpt-4o", messages=conversation)
    return completion.choices[0].message.content

# Run a fixed list of questions against a CSV loaded from disk.
df = pd.read_csv("sales_data.csv")  # replace with the path to your own data

questions = [
    "Which month had the highest revenue?",
    "Is there a correlation between ad spend and sales?",
    "What percentage of revenue comes from the top 3 products?",
]

for question in questions:
    # Echo the question first so each reply is easy to match in the output.
    print(f"\nQ: {question}")
    answer = nl_query(df, question)
    print(answer)

Pattern 4 — Automatic Outlier Detection and Explanation

def explain_outliers(
    df: pd.DataFrame, column: str, *, iqr_multiplier: float = 1.5
) -> str:
    """Detect IQR outliers in ``column`` and ask the model to interpret them.

    A value is an outlier when it lies below ``Q1 - iqr_multiplier * IQR`` or
    above ``Q3 + iqr_multiplier * IQR``.

    Args:
        df: DataFrame holding the data.
        column: Name of the column to inspect.
        iqr_multiplier: Width of the fences in IQR units. The default of 1.5
            matches the previous hard-coded behaviour; a larger value flags
            only more extreme points.

    Returns:
        A fixed "No outliers detected" message when nothing falls outside the
        fences (the model is not called in that case); otherwise the content
        of the first choice in the model's reply.

    Raises:
        ValueError: If ``iqr_multiplier`` is negative.
    """
    if iqr_multiplier < 0:
        raise ValueError("iqr_multiplier must be non-negative")

    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - iqr_multiplier * iqr
    upper_fence = q3 + iqr_multiplier * iqr
    outliers = df[(values < lower_fence) | (values > upper_fence)]

    # Skip the API call entirely when there is nothing to explain.
    if outliers.empty:
        return f"No outliers detected in '{column}'"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": f"""I found {len(outliers)} outliers in the '{column}' column of a dataset.

Outlier values:
{outliers[column].to_string()}

Context of surrounding data:
Mean: {values.mean():.2f}
Std:  {values.std():.2f}
Min:  {values.min():.2f}
Max:  {values.max():.2f}

What could explain these outliers? Should they be removed or kept?"""
        }]
    )
    return response.choices[0].message.content

Pattern 5 — Generate Report Narratives

def _json_default(value):
    """Encode metric values that ``json`` cannot serialize natively.

    Handles date-like objects (anything with ``isoformat``) and NumPy-style
    scalars or arrays (anything with ``tolist``). Every other type still
    raises ``TypeError`` so unexpected objects are not silently stringified.
    """
    if hasattr(value, "isoformat"):
        return value.isoformat()
    if hasattr(value, "tolist"):
        return value.tolist()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


def generate_report(metrics: dict) -> str:
    """Write a short executive summary from a dictionary of metrics.

    Args:
        metrics: Metric names mapped to values. Values computed with pandas or
            NumPy (integer scalars, timestamps) are accepted in addition to
            plain JSON types.

    Returns:
        The content of the first choice in the model's reply.

    Raises:
        TypeError: If a metric value cannot be converted to JSON.
    """
    # Serialize before calling the API so bad input fails without a request.
    # Metrics taken straight from a DataFrame are often NumPy integers or
    # Timestamps, which plain ``json.dumps`` rejects.
    metrics_json = json.dumps(metrics, indent=2, default=_json_default)
    response = client.chat.completions.create(
        model="claude-opus-4-7",
        messages=[{
            "role": "user",
            "content": f"""Write a concise executive summary (3-4 paragraphs) based on these metrics.
Use a professional business tone. Highlight wins, concerns, and recommendations.

Metrics:
{metrics_json}"""
        }]
    )
    return response.choices[0].message.content

# Example: quarterly metrics turned into an executive summary.
metrics = {
    "period": "Q2 2025",
    "total_revenue": 2_450_000,
    "revenue_growth_yoy": "18.3%",
    "best_performing_product": "Pro Plan",
    "churn_rate": "3.2%",
    "new_customers": 847,
    "avg_deal_size": 2893,
    "top_region": "West Coast",
    "underperforming_region": "Midwest (down 12%)",
}
report = generate_report(metrics)
print(report)

Which Model to Use for Data Analysis

Code generation: GPT-4o or DeepSeek V3 — most accurate pandas/SQL code
Explaining results: Claude Opus 4 — most natural business-readable prose
Natural language queries: GPT-4o — best instruction following
Report writing: Claude Opus 4 or GPT-4o — both produce strong narrative

All models are available on the same FreeLLMKeys endpoint. Run Pattern 1 with both GPT-4o and Claude on your actual data to see which produces better code for your specific dataset structure.

FreeLLMKeys Team

Building tools for the AI developer community

Previous: How to Get a Free GPT-4 API Key Without a Credit Card in 2025 | Next: Best LLM for Coding in 2025: 7 Models Tested on Real-World Tasks