
DataQL

Examples

Real-world examples of using DataQL for various data processing tasks.

Data Analysis

Sales Analysis

Analyze sales data to find top products and revenue:

# Create sample sales data
cat > sales.csv << 'EOF'
date,product,category,quantity,price,region
2024-01-15,Laptop,Electronics,5,999.99,North
2024-01-15,Mouse,Electronics,20,29.99,North
2024-01-16,Desk,Furniture,3,299.99,South
2024-01-16,Chair,Furniture,8,199.99,South
2024-01-17,Monitor,Electronics,10,399.99,East
2024-01-17,Keyboard,Electronics,15,89.99,West
2024-01-18,Laptop,Electronics,3,999.99,South
2024-01-18,Desk,Furniture,2,299.99,North
EOF

# Total revenue by category
dataql run -f sales.csv -q "
SELECT
    category,
    SUM(quantity * price) as total_revenue,
    SUM(quantity) as units_sold
FROM sales
GROUP BY category
ORDER BY total_revenue DESC
"

# Top 5 products by revenue
dataql run -f sales.csv -q "
SELECT
    product,
    SUM(quantity * price) as revenue
FROM sales
GROUP BY product
ORDER BY revenue DESC
LIMIT 5
"

# Daily sales trend
dataql run -f sales.csv -q "
SELECT
    date,
    COUNT(*) as transactions,
    SUM(quantity * price) as daily_revenue
FROM sales
GROUP BY date
ORDER BY date
"

# Regional performance
dataql run -f sales.csv -q "
SELECT
    region,
    category,
    SUM(quantity * price) as revenue
FROM sales
GROUP BY region, category
ORDER BY region, revenue DESC
"

Log Analysis

Analyze application logs to find errors and patterns:

# Create sample log data
cat > app.jsonl << 'EOF'
{"timestamp":"2024-01-15T10:30:00Z","level":"INFO","service":"api","message":"Server started","user_id":null}
{"timestamp":"2024-01-15T10:30:05Z","level":"DEBUG","service":"api","message":"Loading config","user_id":null}
{"timestamp":"2024-01-15T10:31:00Z","level":"ERROR","service":"api","message":"Database connection failed","user_id":null}
{"timestamp":"2024-01-15T10:31:30Z","level":"INFO","service":"auth","message":"User login","user_id":"user123"}
{"timestamp":"2024-01-15T10:32:00Z","level":"WARN","service":"api","message":"High memory usage","user_id":null}
{"timestamp":"2024-01-15T10:32:30Z","level":"ERROR","service":"auth","message":"Invalid token","user_id":"user456"}
{"timestamp":"2024-01-15T10:33:00Z","level":"INFO","service":"api","message":"Request processed","user_id":"user123"}
{"timestamp":"2024-01-15T10:33:30Z","level":"ERROR","service":"api","message":"Timeout error","user_id":"user789"}
EOF

# Count errors by service
dataql run -f app.jsonl -q "
SELECT
    service,
    COUNT(*) as error_count
FROM app
WHERE level = 'ERROR'
GROUP BY service
ORDER BY error_count DESC
"

# Find all errors with details
dataql run -f app.jsonl -q "
SELECT timestamp, service, message
FROM app
WHERE level = 'ERROR'
ORDER BY timestamp
"

# Log level distribution
dataql run -f app.jsonl -q "
SELECT
    level,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM app), 2) as percentage
FROM app
GROUP BY level
ORDER BY count DESC
"

# User activity
dataql run -f app.jsonl -q "
SELECT
    user_id,
    COUNT(*) as actions
FROM app
WHERE user_id IS NOT NULL
GROUP BY user_id
"

Data Transformation

CSV to JSONL Conversion

Convert CSV files to JSONL format:

# Convert entire file
dataql run -f data.csv -q "SELECT * FROM data" -e data.jsonl -t jsonl

# Convert with filtering
dataql run -f users.csv \
    -q "SELECT id, name, email FROM users WHERE active = 1" \
    -e active_users.jsonl -t jsonl

# Convert with transformation
dataql run -f orders.csv \
    -q "SELECT id, customer_id, total, date, 'processed' as status FROM orders" \
    -e processed_orders.jsonl -t jsonl
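
The data.csv, users.csv, and orders.csv files above are placeholders for your own data. As a minimal, concrete sketch using the sales.csv file created earlier, each result row becomes one JSON object per line; key order and number formatting may differ slightly in your output:

# Convert the sample sales data from the earlier example
dataql run -f sales.csv -q "SELECT * FROM sales" -e sales.jsonl -t jsonl

# First line of sales.jsonl (approximate):
# {"date":"2024-01-15","product":"Laptop","category":"Electronics","quantity":5,"price":999.99,"region":"North"}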

JSON Flattening

Flatten nested JSON structures:

# Create nested JSON
cat > nested.json << 'EOF'
[
  {
    "user": {
      "id": 1,
      "profile": {
        "name": "Alice",
        "contact": {
          "email": "alice@example.com",
          "phone": "555-1234"
        }
      }
    },
    "orders": 5
  },
  {
    "user": {
      "id": 2,
      "profile": {
        "name": "Bob",
        "contact": {
          "email": "bob@example.com",
          "phone": "555-5678"
        }
      }
    },
    "orders": 3
  }
]
EOF

# Query flattened data
dataql run -f nested.json -q "
SELECT
    user_id,
    user_profile_name as name,
    user_profile_contact_email as email,
    orders
FROM nested
"

Data Aggregation

Aggregate data from multiple files:

# Create sample files
cat > jan_sales.csv << 'EOF'
product,amount
Laptop,5000
Mouse,500
Keyboard,300
EOF

cat > feb_sales.csv << 'EOF'
product,amount
Laptop,6000
Mouse,600
Monitor,2000
EOF

# Combine and aggregate
dataql run -f jan_sales.csv -f feb_sales.csv -q "
SELECT
    product,
    SUM(amount) as total
FROM (
    SELECT product, amount FROM jan_sales
    UNION ALL
    SELECT product, amount FROM feb_sales
)
GROUP BY product
ORDER BY total DESC
"

ETL Pipelines

Extract from PostgreSQL, Transform, Load to JSONL

# Extract and transform data from PostgreSQL
dataql run \
    -f "postgres://user:pass@localhost:5432/db?table=orders" \
    -q "
    SELECT
        o.id,
        o.customer_id,
        o.total,
        o.created_at,
        CASE
            WHEN o.total > 1000 THEN 'high'
            WHEN o.total > 100 THEN 'medium'
            ELSE 'low'
        END as value_tier
    FROM orders o
    WHERE o.status = 'completed'
    " \
    -e processed_orders.jsonl -t jsonl

Merge Multiple Data Sources

# Merge data from S3 and local file
dataql run \
    -f "s3://my-bucket/customers.csv" \
    -f orders.csv \
    -q "
    SELECT
        c.name as customer_name,
        c.email,
        COUNT(o.id) as order_count,
        SUM(o.total) as total_spent
    FROM customers c
    LEFT JOIN orders o ON c.id = o.customer_id
    GROUP BY c.id, c.name, c.email
    HAVING order_count > 0
    ORDER BY total_spent DESC
    " \
    -e customer_summary.csv -t csv

Data Quality Checks

# Check for missing values
dataql run -f data.csv -q "
SELECT
    COUNT(*) as total_rows,
    SUM(CASE WHEN email IS NULL OR email = '' THEN 1 ELSE 0 END) as missing_email,
    SUM(CASE WHEN phone IS NULL OR phone = '' THEN 1 ELSE 0 END) as missing_phone
FROM data
"

# Find duplicates
dataql run -f users.csv -q "
SELECT email, COUNT(*) as count
FROM users
GROUP BY email
HAVING count > 1
"

# Validate data ranges
dataql run -f products.csv -q "
SELECT *
FROM products
WHERE price < 0 OR price > 10000
"

Working with APIs

Query JSON API Response

# Query GitHub API
dataql run \
    -f "https://api.github.com/repos/golang/go/releases" \
    -q "SELECT tag_name, name, published_at FROM releases LIMIT 5"

# Query and filter
dataql run \
    -f "https://jsonplaceholder.typicode.com/posts" \
    -q "SELECT id, title FROM posts WHERE userId = 1"

Process curl Output

# Pipe API response
curl -s "https://api.github.com/users/octocat/repos" | \
    dataql run -f - -q "
    SELECT name, stargazers_count, language
    FROM stdin
    WHERE language IS NOT NULL
    ORDER BY stargazers_count DESC
    LIMIT 10
    "

Cloud Data Processing

S3 Data Lake Query

# Query Parquet files in S3
dataql run \
    -f "s3://my-datalake/events/2024/01/events.parquet" \
    -q "
    SELECT
        event_type,
        COUNT(*) as count,
        AVG(duration_ms) as avg_duration
    FROM events
    GROUP BY event_type
    ORDER BY count DESC
    "

Cross-Cloud Data Join

# Join S3 and GCS data
dataql run \
    -f "s3://aws-bucket/products.csv" \
    -f "gs://gcp-bucket/inventory.csv" \
    -q "
    SELECT
        p.name,
        p.price,
        i.quantity,
        i.warehouse
    FROM products p
    JOIN inventory i ON p.sku = i.sku
    WHERE i.quantity < 10
    ORDER BY i.quantity
    "

Interactive Analysis

Start REPL for Exploration

# Load data and start interactive mode
dataql run -f sales.csv

# In REPL:
dataql> .tables
dataql> .schema sales
dataql> SELECT * FROM sales LIMIT 5;
dataql> SELECT category, COUNT(*) FROM sales GROUP BY category;
dataql> .exit

Multi-Table Exploration

# Load multiple tables for interactive exploration
dataql run -f users.csv -f orders.csv -f products.csv

# In REPL:
dataql> .tables
Tables: users, orders, products

dataql> SELECT u.name, COUNT(o.id) as orders
        FROM users u
        LEFT JOIN orders o ON u.id = o.user_id
        GROUP BY u.id;

Performance Optimization

Query Large Files Efficiently

# Limit rows for exploration
dataql run -f huge_file.csv -l 1000 -q "SELECT * FROM huge_file"

# Use Parquet for large datasets
dataql run -f "s3://bucket/large_data.parquet" -q "
SELECT column1, column2
FROM large_data
WHERE date >= '2024-01-01'
LIMIT 10000
"

Persist for Multiple Queries

# Save to SQLite for repeated queries
dataql run -f large_data.csv -s ./data.db

# Later, query the SQLite directly
sqlite3 ./data.db "SELECT * FROM large_data WHERE condition"

Automation Scripts

Daily Report Script

#!/bin/bash
# daily_report.sh

DATE=$(date +%Y-%m-%d)
OUTPUT_DIR="./reports/$DATE"
mkdir -p "$OUTPUT_DIR"

# Generate sales summary
dataql run \
    -f "postgres://user:pass@localhost/db?table=orders" \
    -q "
    SELECT
        category,
        COUNT(*) as orders,
        SUM(total) as revenue
    FROM orders
    WHERE date = '$DATE'
    GROUP BY category
    " \
    -e "$OUTPUT_DIR/sales_summary.csv" -t csv

# Generate error report
dataql run \
    -f "s3://logs-bucket/app/$DATE/logs.jsonl" \
    -q "
    SELECT service, message, COUNT(*) as count
    FROM logs
    WHERE level = 'ERROR'
    GROUP BY service, message
    ORDER BY count DESC
    " \
    -e "$OUTPUT_DIR/errors.jsonl" -t jsonl

echo "Reports generated in $OUTPUT_DIR"

Data Sync Script

#!/bin/bash
# sync_data.sh

# Read the timestamp of the previous sync (fall back to the epoch on the first run)
LAST_SYNC=$(cat /var/lib/sync/last_sync 2>/dev/null || echo "1970-01-01T00:00:00Z")

# Export rows changed since the last sync from production PostgreSQL
dataql run \
    -f "postgres://readonly:pass@prod-db:5432/app?table=users" \
    -q "SELECT id, email, created_at FROM users WHERE updated_at > '$LAST_SYNC'" \
    -e /tmp/users_delta.jsonl -t jsonl

# Upload the delta to S3
aws s3 cp /tmp/users_delta.jsonl "s3://data-lake/users/$(date +%Y%m%d).jsonl"

# Record the new sync timestamp
date -u +%Y-%m-%dT%H:%M:%SZ > /var/lib/sync/last_sync

See Also