Using LLM APIs for Data Analysis in Python: A Practical Guide (2025)
Use free LLM APIs to analyze datasets, generate insights, write pandas code, explain results, and build natural language query interfaces for your data.
LLMs as Data Analysis Assistants
The most valuable thing an LLM can do for data analysis is not to do the analysis itself — it is to help you write better code faster, explain results clearly, identify patterns you might miss, and build conversational interfaces on top of your data. This guide shows practical patterns for all of these, using free API keys from FreeLLMKeys.
Setup
pip install openai pandas matplotlib seaborn
import json
import os

import pandas as pd
from openai import OpenAI
# Shared client used by every pattern below: the stock OpenAI SDK pointed at
# the FreeLLMKeys endpoint through base_url.
# The key is read from the FREELLMKEYS_API_KEY environment variable so a real
# secret never has to live in the source; the placeholder is only a fallback.
client = OpenAI(
    base_url="https://aiapiv2.pekpik.com/v1",
    api_key=os.environ.get("FREELLMKEYS_API_KEY", "sk-your-freellmkeys-key"),
)
Pattern 1 — Generate Analysis Code from Plain English
Describe what you want to do with your data and get back working pandas code:
def generate_analysis_code(
    df_description: str, task: str, model: str = "gpt-4o"
) -> str:
    """Ask an LLM to write pandas code for a task described in plain English.

    Args:
        df_description: Free-text description of the dataset (columns, types,
            meaning). It is inserted into the prompt verbatim.
        task: What the generated code should accomplish.
        model: Name of the chat model to query. Defaults to "gpt-4o"; pass a
            different name to compare models on the same prompt.

    Returns:
        The generated code with surrounding whitespace removed. If the whole
        reply is wrapped in a single Markdown code fence, the fence lines are
        dropped. An empty string is returned when the reply has no text.
    """
    system_prompt = (
        "You are an expert data analyst. Write clean, correct pandas code. "
        "Return only the code, no explanation."
    )
    user_prompt = (
        "Dataset description:\n"
        f"{df_description}\n"
        f"Task: {task}\n"
        "Write pandas code to accomplish this. "
        "Assume the DataFrame is already loaded as 'df'."
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )

    # The message content is optional in the response; normalise it so the
    # declared ``str`` return type always holds.
    reply = (response.choices[0].message.content or "").strip()

    # Models frequently wrap code in ```python ... ``` even when told to
    # return only code. Unwrap only when the entire reply is one fenced block,
    # so replies mixing prose and several fences are left untouched.
    lines = reply.splitlines()
    is_single_fenced_block = (
        len(lines) >= 2
        and lines[0].startswith("```")
        and lines[-1].strip() == "```"
        and not any(line.startswith("```") for line in lines[1:-1])
    )
    if is_single_fenced_block:
        reply = "\n".join(lines[1:-1])
    return reply
# Example usage: describe the table once, then request code for a single task.
df_desc = """
Sales dataset with columns:
- date (datetime): sale date
- product (str): product name
- category (str): product category
- revenue (float): sale amount in USD
- region (str): US region (North, South, East, West)
- units_sold (int): number of units
"""
code = generate_analysis_code(
    df_description=df_desc,
    task="Find the top 5 products by total revenue for Q4 2024, grouped by category",
)
# The printed reply is model-generated pandas code written against a
# DataFrame named `df`; review it before executing.
print(code)
Pattern 2 — Explain Statistical Results in Plain English
def explain_results(analysis_output: str, context: str = "") -> str:
    """Have an LLM restate analysis output for a non-technical audience.

    Args:
        analysis_output: Text form of the results to explain (for example a
            printed table).
        context: Optional background about the data. When empty, the context
            line in the prompt is left blank.

    Returns:
        The message content of the first choice in the model's response.
    """
    # An empty context still occupies its (blank) line in the prompt.
    context_line = f"Context: {context}" if context else ""
    prompt = (
        "Explain the following data analysis results in plain English.\n"
        "Make it understandable for a non-technical business stakeholder.\n"
        "Highlight the most important insights and any action items.\n"
        f"{context_line}\n"
        "Results:\n"
        f"{analysis_output}"
    )
    completion = client.chat.completions.create(
        model="claude-opus-4-7",  # the guide's pick for explanation quality
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
# Example: compute a small correlation matrix and have it explained.
df = pd.DataFrame(
    {
        "sales": [100, 150, 200, 130, 170],
        "ad_spend": [10, 20, 35, 15, 25],
        "price": [50, 50, 45, 50, 48],
        "temperature": [20, 25, 30, 22, 28],
    }
)
# Rounded to three decimals to keep the text sent to the model compact.
corr = df.corr().round(3)
explanation = explain_results(
    analysis_output=str(corr),
    context="Ice cream sales data for summer 2024",
)
print(explanation)
Pattern 3 — Natural Language Query Interface
Let users ask questions about data in plain English and get answers back:
def nl_query(df: pd.DataFrame, question: str) -> str:
    """Answer a plain-English question about a DataFrame via an LLM.

    Only a text summary of ``df`` is sent to the model: its shape, column
    names, dtypes, the first three rows and the ``describe()`` statistics.

    Args:
        df: The DataFrame the question is about.
        question: The question to ask, in natural language.

    Returns:
        The message content of the first choice in the model's response. The
        system prompt asks for an ANSWER part and a CODE part.
    """
    # The leading and trailing empty entries reproduce the newlines that
    # surround the summary text.
    summary_lines = [
        "",
        f"Shape: {df.shape}",
        f"Columns: {list(df.columns)}",
        "Dtypes:",
        df.dtypes.to_string(),
        "Sample (first 3 rows):",
        df.head(3).to_string(),
        "Statistics:",
        df.describe().to_string(),
        "",
    ]
    df_info = "\n".join(summary_lines)

    system_prompt = (
        "You are a data analyst. When given a question about a dataset,\n"
        "respond in two parts:\n"
        "1. ANSWER: A direct answer to the question\n"
        "2. CODE: The pandas code that produces this answer (if applicable)"
    )
    user_prompt = f"Dataset info:\n{df_info}\n\nQuestion: {question}"

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
    )
    return completion.choices[0].message.content
# Ask a fixed list of sample questions against one dataset.
df = pd.read_csv("sales_data.csv")  # replace with the path to your own data
questions = [
    "Which month had the highest revenue?",
    "Is there a correlation between ad spend and sales?",
    "What percentage of revenue comes from the top 3 products?",
]
for q in questions:
    # Print the question first so each answer appears directly beneath it.
    print(f"\nQ: {q}")
    print(nl_query(df=df, question=q))
Pattern 4 — Automatic Outlier Detection and Explanation
def explain_outliers(
    df: pd.DataFrame, column: str, iqr_multiplier: float = 1.5
) -> str:
    """Flag outliers in a column with the IQR rule and ask an LLM about them.

    A value counts as an outlier when it lies more than
    ``iqr_multiplier * IQR`` below the first quartile or above the third.

    Args:
        df: DataFrame containing the data.
        column: Name of the column to inspect.
        iqr_multiplier: Fence width in IQR units. The default of 1.5 keeps
            the commonly used 1.5 x IQR rule; larger values flag fewer points.

    Returns:
        A fixed "No outliers detected" message when nothing is flagged (no
        API call is made in that case); otherwise the message content of the
        first choice in the model's response.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - iqr_multiplier * iqr
    upper_fence = q3 + iqr_multiplier * iqr

    outliers = df[(values < lower_fence) | (values > upper_fence)]
    if outliers.empty:
        return f"No outliers detected in '{column}'"

    # Summary statistics of the full column give the model a baseline to
    # judge the flagged values against.
    prompt = (
        f"I found {len(outliers)} outliers in the '{column}' column of a dataset.\n"
        "Outlier values:\n"
        f"{outliers[column].to_string()}\n"
        "Context of surrounding data:\n"
        f"Mean: {values.mean():.2f}\n"
        f"Std: {values.std():.2f}\n"
        f"Min: {values.min():.2f}\n"
        f"Max: {values.max():.2f}\n"
        "What could explain these outliers? Should they be removed or kept?"
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
Pattern 5 — Generate Report Narratives
def generate_report(metrics: dict) -> str:
    """Ask an LLM for a short executive summary of a metrics dictionary.

    Args:
        metrics: Mapping of metric names to values. It is rendered as
            indented JSON inside the prompt.

    Returns:
        The message content of the first choice in the model's response.
    """
    # Metrics derived from pandas/NumPy often hold values the json module
    # cannot encode (e.g. np.int64, Timestamp). The JSON here is only prompt
    # text for the model, so falling back to str() for such values is
    # deliberate and avoids a TypeError.
    metrics_json = json.dumps(metrics, indent=2, default=str)
    prompt = (
        "Write a concise executive summary (3-4 paragraphs) based on these metrics.\n"
        "Use a professional business tone. Highlight wins, concerns, and recommendations.\n"
        "Metrics:\n"
        f"{metrics_json}"
    )
    response = client.chat.completions.create(
        model="claude-opus-4-7",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content
# Example: summarise one quarter's headline numbers.
report = generate_report(
    metrics={
        "period": "Q2 2025",
        "total_revenue": 2_450_000,
        "revenue_growth_yoy": "18.3%",
        "best_performing_product": "Pro Plan",
        "churn_rate": "3.2%",
        "new_customers": 847,
        "avg_deal_size": 2893,
        "top_region": "West Coast",
        "underperforming_region": "Midwest (down 12%)",
    }
)
print(report)
Which Model to Use for Data Analysis
- Code generation: GPT-4o or DeepSeek V3 — most accurate pandas/SQL code
- Explaining results: Claude Opus 4 (`claude-opus-4-7` in the code above) — most natural business-readable prose
- Natural language queries: GPT-4o — best instruction following
- Report writing: Claude Opus 4 (`claude-opus-4-7` in the code above) or GPT-4o — both produce strong narrative
All models are available on the same FreeLLMKeys endpoint. Run Pattern 1 with both GPT-4o and Claude on your actual data — only the `model` value passed to `client.chat.completions.create` needs to change — and see which produces better code for your specific dataset structure.