COMPLETE TROUBLESHOOTING GUIDE - nself-org/cli GitHub Wiki
Version 0.9.9 | Symptom → Diagnosis → Solution
- Quick Diagnostic Commands
- Services Won't Start
- Database Issues
- Authentication Problems
- API/GraphQL Errors
- Performance Issues
- Storage Problems
- Network/Connectivity Issues
- SSL/Certificate Errors
- Build/Configuration Errors
- Monitoring Not Working
- Multi-Tenancy Issues
Run these first to gather information:
# 1. Overall system status
nself status
# 2. Check for errors in logs
nself logs --level error --since 1h
# 3. Run diagnostics
nself doctor
# 4. Check resource usage
nself metrics
# 5. Verify configuration
nself config validate
# 6. Check service health
nself health
Diagnostic:
# Check what failed
nself status
# View detailed logs
nself logs
# Check Docker daemon
docker info
# Check for port conflicts
sudo lsof -i -P -n | grep LISTEN
Common Causes & Solutions:
Error:
Error starting userland proxy: listen tcp4 0.0.0.0:5432: bind: address already in use
Solution:
# Find process using port
sudo lsof -i :5432
# Kill the process
sudo kill -9 PID
# Or change port in .env
POSTGRES_PORT=5433
# Rebuild and restart
nself build && nself start
Error:
Cannot connect to the Docker daemon. Is the docker daemon running?
Solution:
# macOS
open -a Docker
# Linux
sudo systemctl start docker
sudo systemctl enable docker
# Verify
docker info
Error:
Container exited with code 137 (OOM Killed)
Solution:
# Check Docker memory limits
docker info | grep Memory
# Increase Docker memory (Docker Desktop)
# Settings → Resources → Memory → Increase to 8GB+
# Reduce service memory limits
# Edit docker-compose.yml:
services:
  postgres:
    mem_limit: 2G # Reduce if needed
# Restart
nself restart
Error:
Error: POSTGRES_PASSWORD is required
Solution:
# Check .env file exists
ls -la .env
# Validate configuration
nself config validate
# Generate missing variables
nself init --reconfigure
# Or manually add to .env:
POSTGRES_PASSWORD=$(openssl rand -base64 32)
Error:
Error response from daemon: network ... not found
Solution:
# Clean up Docker state
nself stop
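# WARNING: this removes ALL unused Docker images, networks, and volumes on this host.
# With the stack stopped, that includes this project's data volumes (e.g. the database) - back up first.
docker system prune -af --volumes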
# Rebuild
nself build
nself start
Diagnostic:
# Check recent container logs
nself logs service-name --tail 100
# Check exit code
docker ps -a | grep service-name
Common Causes & Solutions:
Check logs for:
Error parsing config file
Invalid configuration
Solution:
# Validate service config
nself config validate
# Check service-specific config
cat docker-compose.yml | grep service-name -A 20
# Rebuild configuration
nself build --force
nself restart service-name
Symptom: Hasura starts before PostgreSQL is ready
Solution:
# Increase health check timeout
# Edit docker-compose.yml:
services:
  hasura:
    depends_on:
      postgres:
        condition: service_healthy
  postgres:
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U postgres"]
      interval: 5s
      timeout: 5s
      retries: 10 # Increase this
# Restart
nself restart
Error:
could not connect to server: Connection refused
Diagnostic:
# Check if PostgreSQL is running
nself status --services postgres
# Try manual connection
docker exec -it $(docker ps -qf "name=postgres") psql -U postgres
# Check logs
nself logs postgres --tail 50
Solutions:
# Start PostgreSQL
nself start postgres
# Wait for ready
nself health postgres --wait 60
# Check .env for correct credentials
grep POSTGRES .env
# Reset password
POSTGRES_PASSWORD=new-password
# Rebuild
nself build && nself restart postgres
Error:
FATAL: sorry, too many clients already
Solution:
# Check active connections
nself db query "SELECT count(*) FROM pg_stat_activity;"
# Increase connection limit
# .env:
POSTGRES_MAX_CONNECTIONS=200
# Rebuild
nself build && nself restart postgres
# Or use connection pooler (PgBouncer)
nself service enable pgbouncer
Diagnostic:
# Find slow queries
nself db query "
SELECT query, calls, mean_exec_time, max_exec_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;
"
# Check for missing indexes
nself db inspect indexes --missing
# Check table sizes
nself db inspect tables --size
Solutions:
# Identify table scans
nself db query "
SELECT schemaname, tablename, seq_scan, seq_tup_read
FROM pg_stat_user_tables
WHERE seq_scan > 0
ORDER BY seq_tup_read DESC
LIMIT 10;
"
# Create index
nself db query "CREATE INDEX CONCURRENTLY idx_users_email ON users(email);"
# Analyze table
nself db query "ANALYZE users;"
# Check for bloat
nself db inspect bloat
# Vacuum database
nself db optimize vacuum
# For severe bloat
nself db optimize vacuum-full # WARNING: Locks tables
# Check PostgreSQL memory
nself status postgres --verbose
# Increase shared_buffers
# .env:
POSTGRES_SHARED_BUFFERS=4GB
# Rebuild
nself build && nself restart postgres
Error:
Invalid credentials
Diagnostic:
# Check Auth service
nself status --services auth
# Check Auth logs
nself logs auth --tail 50
# Verify user exists
nself db query "SELECT email, email_verified FROM auth.users WHERE email = 'user@example.com';"
Solutions:
# Manually verify email
nself auth verify-email user@example.com
# Or via database
nself db query "UPDATE auth.users SET email_verified = true WHERE email = 'user@example.com';"
# Check lock status
nself db query "SELECT email, locked_until FROM auth.users WHERE email = 'user@example.com';"
# Unlock account
nself auth unlock user@example.com
Error:
Invalid JWT signature
Solution:
# Check JWT secret consistency
grep JWT .env
# Must match between Auth and Hasura
HASURA_GRAPHQL_JWT_SECRET='{"type":"HS256","key":"same-key-here"}'
AUTH_JWT_SECRET=same-key-here
# Rebuild
nself build && nself restart
Diagnostic:
# Check session duration settings
grep SESSION .env
# Check Redis (if session store)
docker exec -it $(docker ps -qf "name=redis") redis-cli KEYS "session:*"
Solution:
# Increase session duration
# .env:
AUTH_ACCESS_TOKEN_EXPIRES_IN=900 # 15 minutes
AUTH_REFRESH_TOKEN_EXPIRES_IN=2592000 # 30 days
# Rebuild
nself build && nself restart auth
Error:
{
"errors": [
{
"message": "field \"users\" not found in type: 'query_root'",
"extensions": {
"code": "validation-failed"
}
}
]
}
Diagnostic:
# Check Hasura permissions
# Open Hasura Console
nself admin hasura
# Check table permissions
nself db query "SELECT * FROM information_schema.table_privileges WHERE grantee = 'your_role';"
Solutions:
# Grant select permission in Hasura Console
# Or via metadata:
nself hasura metadata apply -f hasura/metadata/tables.yaml
# Check RLS policies
nself db query "SELECT * FROM pg_policies WHERE tablename = 'users';"
# Temporarily disable RLS (development only!)
nself db query "ALTER TABLE users DISABLE ROW LEVEL SECURITY;"
# Or fix policy
nself db query "
CREATE POLICY user_select_own ON users
FOR SELECT
USING (id = current_setting('hasura.user.id')::uuid);
"
Diagnostic:
# Check Hasura logs for slow queries
nself logs hasura | grep "duration"
# Enable query logging
# .env:
HASURA_GRAPHQL_ENABLE_CONSOLE=true
HASURA_GRAPHQL_DEV_MODE=true
HASURA_GRAPHQL_ENABLE_TELEMETRY=false
HASURA_GRAPHQL_LOG_LEVEL=warn
# View in Hasura Console "Analyze" tab
Solutions:
Before (slow):
{
users {
id
posts { # Separate query per user
title
}
}
}
After (fast - use relationship):
# Define relationship in Hasura first
# Then query efficiently
{
users {
id
posts_aggregate { # Single query
aggregate {
count
}
}
}
}
# Hasura shows which fields are queried most
# Create indexes for them
nself db query "CREATE INDEX CONCURRENTLY idx_posts_user_id ON posts(user_id);"
Diagnostic:
# Check which service
nself status --verbose
# View container stats
docker stats
# Check process inside container
nself exec postgres top
Solutions:
# Find expensive queries
nself db query "
SELECT pid, usename, query, state
FROM pg_stat_activity
WHERE state = 'active' AND query NOT LIKE '%pg_stat_activity%'
ORDER BY query_start;
"
# Kill runaway query
nself db query "SELECT pg_terminate_backend(12345);"
# Add query timeout
# .env:
POSTGRES_STATEMENT_TIMEOUT=30000 # 30 seconds
# Check active subscriptions
nself logs hasura | grep "subscription"
# Limit subscriptions per connection
# .env:
HASURA_GRAPHQL_LIVE_QUERIES_MULTIPLEXED_REFETCH_INTERVAL=1000
HASURA_GRAPHQL_LIVE_QUERIES_MULTIPLEXED_BATCH_SIZE=100
Diagnostic:
# View memory by container
docker stats --no-stream
# Check for memory leaks
nself logs --grep "memory" --since 24h
Solutions:
# Reduce shared_buffers
# .env:
POSTGRES_SHARED_BUFFERS=2GB # Down from 8GB
# Reduce work_mem
POSTGRES_WORK_MEM=32MB # Down from 64MB
# Restart
nself restart postgres
# Check custom service memory
docker stats custom_service
# Restart service (temporary fix)
nself restart custom_service
# Find leak (add to service):
node --max-old-space-size=2048 --expose-gc app.js
# Use heap snapshots
npm install -g node-heapdump
# Take snapshot
kill -USR2 <pid>
Error:
Failed to upload file: Permission denied
Diagnostic:
# Check MinIO status
nself status --services minio
# Check MinIO logs
nself logs minio
# Test connectivity
curl https://minio.local.nself.org/health
Solutions:
# Start MinIO
nself start minio
# Verify
nself urls | grep minio
# List buckets
nself exec minio mc ls local
# Create bucket
nself exec minio mc mb local/my-bucket
# Set public policy (if needed)
nself exec minio mc anonymous set download local/my-bucket
# Configure CORS
nself exec minio mc anonymous set-json /tmp/cors.json local/my-bucket
# cors.json (create this file BEFORE running the command above, which reads it):
cat > /tmp/cors.json <<EOF
{
"CORSRules": [
{
"AllowedOrigins": ["*"],
"AllowedMethods": ["GET", "PUT", "POST", "DELETE"],
"AllowedHeaders": ["*"]
}
]
}
EOF
Error:
This site can't be reached
Diagnostic:
# Check Nginx status
nself status --services nginx
# Check Nginx logs
nself logs nginx
# Test internally
nself exec nginx curl http://hasura:8080/healthz
# Check DNS resolution
nslookup api.local.nself.org
Solutions:
# Start Nginx
nself start nginx
# Check for config errors
nself exec nginx nginx -t
# Check /etc/hosts
grep nself /etc/hosts
# Should have:
127.0.0.1 local.nself.org
127.0.0.1 api.local.nself.org
127.0.0.1 auth.local.nself.org
# If missing, rebuild
nself build
# Check firewall status
sudo ufw status
# Allow ports
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
# Reload
sudo ufw reload
Error:
Your connection is not private
NET::ERR_CERT_AUTHORITY_INVALID
Solutions:
# Trust self-signed cert (macOS)
sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain ssl/cert.pem
# Chrome: Type "thisisunsafe" on warning page
# Check expiry
openssl x509 -in ssl/cert.pem -noout -dates
# Regenerate (development)
nself config ssl generate
# Production: Renew Let's Encrypt
sudo certbot renew
nself restart nginx
Error:
ERR_CERT_COMMON_NAME_INVALID
Solution:
# Check certificate CN
openssl x509 -in ssl/cert.pem -noout -subject
# Should match domain in browser
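# If not, regenerate for correct domain
# Note: modern browsers match the hostname against subjectAltName, not the CN.
# With OpenSSL 1.1.1+ also append: -addext "subjectAltName=DNS:yourdomain.com"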
openssl req -new -x509 -days 365 -nodes \
-out ssl/cert.pem \
-keyout ssl/key.pem \
-subj "/CN=yourdomain.com"
Diagnostic:
# Run build with verbose output
nself build --verbose
# Validate configuration
nself config validate
# Check for syntax errors in .env
cat .env | grep -v "^#" | grep -v "^$"
Solutions:
Error:
Error: Invalid value for POSTGRES_PORT
Solution:
# Check value type
grep POSTGRES_PORT .env
# Must be number
POSTGRES_PORT=5432 # Not "5432" (no quotes)
# Rebuild
nself build
Error:
Error: PROJECT_NAME is required
Solution:
# Add to .env
PROJECT_NAME=my-app
# Or re-run init
nself init --reconfigure
Error:
Error parsing template: unexpected EOF
Solution:
# Clean generated files
rm -rf docker-compose.yml nginx/ services/
# Rebuild from scratch
nself build --clean
Diagnostic:
# Check monitoring services
nself status --services prometheus,grafana,loki
# Check Prometheus targets
curl https://prometheus.local.nself.org/api/v1/targets
# If you cannot log in to Grafana to check datasources, reset the admin password (sets it to "admin")
nself exec grafana grafana-cli admin reset-admin-password admin
Solutions:
# Check Prometheus config
nself exec prometheus cat /etc/prometheus/prometheus.yml
# Verify targets are up
# Open: https://prometheus.local.nself.org/targets
# Restart Prometheus
nself restart prometheus
# Open Grafana
nself monitor
# Go to Configuration → Data Sources
# Add Prometheus:
# URL: http://prometheus:9090
# Access: Server
# Test and Save
# Check Promtail is running
nself status --services promtail
# Promtail must be running for logs to reach Loki!
# If not enabled:
# .env:
MONITORING_ENABLED=true
# Rebuild
nself build && nself restart
CRITICAL SECURITY ISSUE
Diagnostic:
# Check RLS is enabled
nself db query "
SELECT schemaname, tablename, rowsecurity
FROM pg_tables
WHERE schemaname = 'public';
"
# Check current tenant context
nself db query "SELECT current_setting('app.tenant_id', true);"
Solution:
# Enable RLS on all tables
nself db query "ALTER TABLE users ENABLE ROW LEVEL SECURITY;"
# Create tenant isolation policy
nself db query "
CREATE POLICY tenant_isolation ON users
USING (tenant_id = current_setting('app.tenant_id')::uuid);
"
# Verify
nself tenant verify-isolation
Enable maximum verbosity:
# .env:
ENV=development
LOG_LEVEL=debug
HASURA_GRAPHQL_DEV_MODE=true
HASURA_GRAPHQL_ENABLE_CONSOLE=true
# Rebuild
nself build && nself restart
# View all logs
nself logs -f --timestamps
- Run diagnostics:
nself doctor > diagnostics.txt
- Collect logs:
nself logs --since 1h > logs.txt
- System info:
nself version
docker version
uname -a
- Search existing issues:
- GitHub: https://github.com/nself-org/cli/issues
- Discord: https://discord.gg/nself
Include:
- nself version
- Operating system
- Docker version
- Full error message
- Steps to reproduce
- Relevant logs
- Configuration (with secrets redacted)
Template:
**nself version:** 0.9.8
**OS:** Ubuntu 22.04
**Docker:** 24.0.5
**Issue:** Services won't start after upgrading
**Steps to reproduce:**
1. Ran `nself update`
2. Ran `nself restart`
3. PostgreSQL fails to start
**Error:**
ERROR: could not access file "pg_hba.conf": Permission denied
**Logs:**
<attached logs.txt>
**Config:**
<attached diagnostics.txt>
Emergency Support:
- Critical Production Issues: [email protected]
- Security Vulnerabilities: [email protected]
- Community Discord: https://discord.gg/nself