# hive-apps/projects/ai/values/localai.yaml

# Number of replicas requested from the chart.
replicaCount: 1

# Settings for the LocalAI workload: container image, environment variables
# and the helper images used alongside it.
deployment:
  image:
    repository: quay.io/go-skynet/local-ai
    tag: v2.11.0-cublas-cuda12-ffmpeg-core
  # NOTE(review): the image tag above is pinned, so `Always` may be
  # unnecessary — confirm whether `IfNotPresent` would do.
  pullPolicy: Always
  # Presumably selects the `nvidia` RuntimeClass for GPU access — confirm
  # against the chart's pod template.
  runtimeClassName: nvidia

  # Environment for the LocalAI container, written as a `NAME: value` map.
  # The commented-out entries below are optional settings, shown in YAML map
  # syntax so that they can be uncommented in place.
  env:
    threads: 16
    context_size: 4096
    ## Enable debug mode
    DEBUG: "true"

    ## Enable/Disable single backend (useful if only one GPU is available)
    # SINGLE_ACTIVE_BACKEND: "true"
    # PYTHON_GRPC_MAX_WORKERS: "1"
    # LLAMACPP_PARALLEL: "1"
    # PARALLEL_REQUESTS: "false"

    ## Specify a different bind address (defaults to ":8080")
    # ADDRESS: "127.0.0.1:8080"

    ## Define galleries.
    ## Models available to install will be listed at `/models/available`.
    GALLERIES: '[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'

    ## Default path for models
    # MODELS_PATH: "/models"

    ## Disables COMPEL (lets Stable Diffusion work; uncomment if you plan on using it)
    # COMPEL: "0"

    ## Specify a build type. Available: cublas, openblas, clblas.
    # BUILD_TYPE: "cublas"

    ## Uncomment and set to true to enable rebuilding from source
    # REBUILD: "true"

    ## Enable go tags, available: stablediffusion, tts
    ## stablediffusion: image generation with stablediffusion
    ## tts: enables text-to-speech with go-piper
    ## (requires REBUILD: "true")
    # GO_TAGS: "tts"

    ## Path where to store generated images
    # IMAGE_PATH: "/tmp"

    ## Specify a default upload limit in MB (whisper)
    # UPLOAD_LIMIT: "<size in MB>"

    # HUGGINGFACEHUB_API_TOKEN: "<token here>"

  # Same path as persistence.models.globalMount further down in this file.
  modelsPath: "/models"
  download_model:
    # To pull from a cloud/private registry (e.g. AWS ECR), give the full
    # image reference, e.g. 1234356789.dkr.ecr.us-REGION-X.amazonaws.com/busybox
    image: busybox
  prompt_templates:
    # To pull from a cloud/private registry (e.g. AWS ECR), give the full
    # image reference, e.g. 1234356789.dkr.ecr.us-REGION-X.amazonaws.com/busybox
    image: busybox

# Memory request and limit for the workload.
# NOTE(review): the request (100Mi) is far below the limit (64Gi); confirm
# this gap is intended.
# NOTE(review): no GPU resource is requested here; GPU access appears to rely
# on deployment.runtimeClassName alone — confirm.
resources:
  requests:
    memory: 100Mi
  limits:
    memory: 64Gi

# Prompt template files. Each key of this map becomes the name of one prompt
# template file. No templates are defined here.
# To add one, replace `{}` with a block mapping such as:
#
#   promptTemplates:
#     ggml-gpt4all-j.tmpl: |
#       The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
#       ### Prompt:
#       {{.Input}}
#       ### Response:
promptTemplates: {}

# Models to download at runtime
models:
  # Whether to force download models even if they already exist
  forceDownload: false

  # The list of URLs to download models from.
  # Note: the name of the file will be the name of the loaded model.
  # No models are listed here. To add one, replace `[]` with a block sequence:
  #
  #   list:
  #     - url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
  #       basicAuth: base64EncodedCredentials
  list: []

# Persistent volumes, one entry per volume. Each entry carries an enable flag,
# annotations, storage class, access mode, size and mount path.
persistence:
  # Model storage; enabled. The mount path equals deployment.modelsPath.
  models:
    enabled: true
    annotations: {}
    storageClass: ssd
    accessModes: ReadWriteOnce
    size: 100Gi
    globalMount: /models
  # Output storage; currently disabled, so the remaining settings here are
  # presumably unused — confirm against the chart.
  output:
    enabled: false
    annotations: {}
    storageClass: ssd
    accessModes: ReadWriteOnce
    size: 100Gi
    globalMount: /tmp/generated

# Service in front of the workload: ClusterIP type, port 80.
service:
  type: ClusterIP
  port: 80

# Ingress for host `ai.dc` using the "ingress-internal" class. TLS uses the
# `localai-tls` secret; the cert-manager annotation names the `vault-issuer`
# cluster issuer.
ingress:
  enabled: true
  className: "ingress-internal"
  annotations:
    cert-manager.io/cluster-issuer: vault-issuer
    # Presumably set so that streamed responses are not buffered by the proxy.
    nginx.ingress.kubernetes.io/proxy-buffering: "off"
    nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
    # NOTE(review): this is not a documented ingress-nginx annotation and is
    # likely ignored by the controller — confirm, then drop or replace.
    nginx.ingress.kubernetes.io/chunked-transfer-encoding: "on"
    # ingress-nginx annotation names are hyphenated; the previous underscore
    # spelling (`proxy_request_buffering`) is not recognised by the controller.
    nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
  hosts:
    - host: ai.dc
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls:
    - secretName: localai-tls
      hosts:
        - ai.dc