# Helm values override that appears to target the LocalAI chart: one replica
# on the NVIDIA runtime class, a persistent volume for models, and an ingress
# with TLS.
# NOTE(review): this file had lost its line breaks and indentation. The nesting
# below (in particular pullPolicy/runtimeClassName/secretEnv under
# `deployment`, and `resources` at the top level) was reconstructed to follow
# the upstream local-ai chart's values layout -- confirm against the chart
# version in use.
replicaCount: 1

deployment:
  image:
    repository: quay.io/go-skynet/local-ai
    # tag: latest-aio-gpu-nvidia-cuda-12
    tag: v2.22.1-cublas-cuda12-ffmpeg
  pullPolicy: Always
  runtimeClassName: nvidia

  # Settings handed to the LocalAI container (presumably rendered as
  # environment variables by the chart -- verify against its templates).
  # Values are quoted so they stay strings regardless of how they are rendered.
  env:
    threads: "16"
    context_size: "4096"
    DEBUG: "true"
    #
    # SINGLE_ACTIVE_BACKEND: "true"
    # PYTHON_GRPC_MAX_WORKERS: "1"
    # LLAMACPP_PARALLEL: "1"
    # PARALLEL_REQUESTS: "false"

    ## Specify a different bind address (defaults to ":8080")
    # ADDRESS=127.0.0.1:8080

    ## Define galleries.
    ## Models available to install will be visible in `/models/available`
    # GALLERIES: '[{"name":"model-gallery", "url":"github:go-skynet/model-gallery/index.yaml"}, {"url": "github:go-skynet/model-gallery/huggingface.yaml","name":"huggingface"}]'

    ## Default path for models
    # MODELS_PATH=/models

    ## Enable debug mode
    # DEBUG=true

    ## Disables COMPEL (Lets Stable Diffuser work, uncomment if you plan on using it)
    # COMPEL=0

    ## Enable/Disable single backend (useful if only one GPU is available)
    # SINGLE_ACTIVE_BACKEND=true

    ## Specify a build type. Available: cublas, openblas, clblas.
    # BUILD_TYPE=cublas

    ## Uncomment and set to true to enable rebuilding from source
    # REBUILD=true

    ## Enable go tags, available: stablediffusion, tts
    ## stablediffusion: image generation with stablediffusion
    ## tts: enables text-to-speech with go-piper
    ## (requires REBUILD=true)
    #
    # GO_TAGS=tts

    ## Path where to store generated images
    # IMAGE_PATH=/tmp

    ## Specify a default upload limit in MB (whisper)
    # UPLOAD_LIMIT

    # HUGGINGFACEHUB_API_TOKEN=Token here

  # Inject Secrets into Environment:
  # HF_TOKEN is read from key `hf-token` of the Secret named `localai`, so the
  # token itself is not stored in this file.
  secretEnv:
    - name: HF_TOKEN
      valueFrom:
        secretKeyRef:
          name: localai
          key: hf-token

  modelsPath: "/models"

  download_model:
    # To use cloud provided (eg AWS) image, provide it like: 1234356789.dkr.ecr.us-REGION-X.amazonaws.com/busybox
    image: busybox

  prompt_templates:
    # To use cloud provided (eg AWS) image, provide it like: 1234356789.dkr.ecr.us-REGION-X.amazonaws.com/busybox
    image: busybox

resources:
  requests:
    memory: 100Mi
  limits:
    memory: 64Gi

# Note: the keys of this map will be the names of the prompt template files
promptTemplates: {}
  # ggml-gpt4all-j.tmpl: |
  #   The prompt below is a question to answer, a task to complete, or a conversation to respond to; decide which and write an appropriate response.
  #   ### Prompt:
  #   {{.Input}}
  #   ### Response:

# Models to download at runtime
models:
  # Whether to force download models even if they already exist
  forceDownload: false

  # The list of URLs to download models from
  # Note: the name of the file will be the name of the loaded model
  list: []
    # - url: "https://gpt4all.io/models/ggml-gpt4all-j.bin"
    #   basicAuth: base64EncodedCredentials

persistence:
  # Volume for model files; globalMount matches deployment.modelsPath.
  models:
    enabled: true
    annotations: {}
    storageClass: ssd
    accessModes: ReadWriteOnce
    size: 100Gi
    globalMount: /models
  # Volume for generated output; currently disabled.
  output:
    enabled: false
    annotations: {}
    storageClass: ssd
    accessModes: ReadWriteOnce
    size: 100Gi
    globalMount: /tmp/generated

service:
  type: ClusterIP
  port: 80

ingress:
  enabled: true
  className: "ingress-internal"
  annotations:
    cert-manager.io/cluster-issuer: vault-issuer
    nginx.ingress.kubernetes.io/proxy-buffering: "off"
    nginx.ingress.kubernetes.io/proxy-http-version: "1.1"
    # NOTE(review): could not confirm this is a recognised ingress-nginx
    # annotation -- verify it has any effect.
    nginx.ingress.kubernetes.io/chunked-transfer-encoding: "on"
    # The annotation name is hyphenated; the earlier underscore spelling
    # (proxy_request_buffering) is not an ingress-nginx annotation.
    nginx.ingress.kubernetes.io/proxy-request-buffering: "off"
  hosts:
    - host: ai.dc
      paths:
        - path: /
          pathType: ImplementationSpecific
  tls:
    - secretName: localai-tls
      hosts:
        - ai.dc