First Custom Model

AlexNet Inference

More information about the AlexNet service can be found at 🔗link

  1. Implement a custom model using the KServe API
import argparse
import base64
import io
import time

from fastapi.middleware.cors import CORSMiddleware
from torchvision import models, transforms
from typing import Dict
import torch
from PIL import Image

import kserve
from kserve import Model, ModelServer, logging
from kserve.model_server import app
from kserve.utils.utils import generate_uuid


class AlexNetModel(Model):
    def __init__(self, name: str):
        super().__init__(name, return_response_headers=True)
        self.name = name
        self.ready = False
        self.load()

    def load(self):
        self.model = models.alexnet(pretrained=True)
        self.model.eval()
        # The ready flag is used by the model ready endpoint for readiness probes;
        # it is set to True when the model has loaded successfully without exceptions.
        self.ready = True

    async def predict(
        self,
        payload: Dict,
        headers: Dict[str, str] = None,
        response_headers: Dict[str, str] = None,
    ) -> Dict:
        start = time.time()
        # Input follows the TensorFlow V1 HTTP API for binary values
        # https://www.tensorflow.org/tfx/serving/api_rest#encoding_binary_values
        img_data = payload["instances"][0]["image"]["b64"]
        raw_img_data = base64.b64decode(img_data)
        input_image = Image.open(io.BytesIO(raw_img_data))
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        input_tensor = preprocess(input_image).unsqueeze(0)
        output = self.model(input_tensor)
        # Note: the softmax result is not kept; the top-5 scores below are taken from
        # the raw logits, which is why the sample response further down contains values > 1.
        torch.nn.functional.softmax(output, dim=1)
        values, top_5 = torch.topk(output, 5)
        result = values.flatten().tolist()
        end = time.time()
        response_id = generate_uuid()

        # Custom response headers can be added to the inference response
        if response_headers is not None:
            response_headers.update(
                {"prediction-time-latency": f"{round((end - start) * 1000, 9)}"}
            )

        return {"predictions": result}


parser = argparse.ArgumentParser(parents=[kserve.model_server.parser])
args, _ = parser.parse_known_args()

if __name__ == "__main__":
    # Configure the kserve and uvicorn loggers
    if args.configure_logging:
        logging.configure_logging(args.log_config_file)
    model = AlexNetModel(args.model_name)
    model.load()
    # Custom middlewares can be added to the model server app
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    ModelServer().start([model])
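
Before containerizing the service, you can sanity-check the model in-process. The following is a minimal sketch, assuming the class above is saved as model.py and that a local image file named cat.jpg exists (both names are placeholders for your own paths); it builds a request in the same {"instances": [{"image": {"b64": ...}}]} shape that predict() parses and calls the method directly:

import asyncio
import base64
import json

from model import AlexNetModel  # assumes the class above is saved as model.py


async def main():
    # Instantiate the model; load() runs in __init__ and sets ready=True.
    model = AlexNetModel("custom-model")

    # Encode a local image the way the TensorFlow V1 HTTP API expects binary values.
    # "cat.jpg" is a placeholder; use any local image file.
    with open("cat.jpg", "rb") as f:
        b64_image = base64.b64encode(f.read()).decode("utf-8")
    payload = {"instances": [{"image": {"b64": b64_image}}]}

    response_headers = {}
    result = await model.predict(payload, headers={}, response_headers=response_headers)
    print(json.dumps(result))
    print("prediction-time-latency:", response_headers.get("prediction-time-latency"))


if __name__ == "__main__":
    asyncio.run(main())
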
  2. Create requirements.txt
kserve
torchvision==0.18.0
pillow>=10.3.0,<11.0.0
  3. Create a Dockerfile
FROM m.daocloud.io/docker.io/library/python:3.11-slim

WORKDIR /app

COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY model.py .

CMD ["python", "model.py", "--model_name=custom-model"]
  4. Build and push the custom Docker image
docker build -t ay-custom-model .
docker tag ay-custom-model docker-registry.lab.zverse.space/ay/ay-custom-model:latest
docker push docker-registry.lab.zverse.space/ay/ay-custom-model:latest
  5. Create a namespace
kubectl create namespace kserve-test
  6. Deploy a sample custom-model service
kubectl apply -n kserve-test -f - <<EOF
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: ay-custom-model
spec:
  predictor:
    containers:
      - name: kserve-container
        image: docker-registry.lab.zverse.space/ay/ay-custom-model:latest
EOF
  7. Check the InferenceService status
kubectl -n kserve-test get pod
#NAME                                           READY   STATUS    RESTARTS   AGE
#ay-custom-model-predictor-00003-dcf4rk         2/2     Running   0        167m

kubectl -n kserve-test get inferenceservices ay-custom-model
#NAME           URL   READY     PREV   LATEST   PREVROLLEDOUTREVISION   LATESTREADYREVISION   AGE
#ay-custom-model   http://ay-custom-model.kserve-test.example.com   True           100                              ay-custom-model-predictor-00003   177m

After all pods are ready, you can access the service through the Istio ingress gateway.

Determine the ingress host and port. Check the EXTERNAL-IP of the istio-ingressgateway service with kubectl -n istio-system get service istio-ingressgateway.

If the EXTERNAL-IP value is set, your environment has an external load balancer that you can use for the ingress gateway.

export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].port}')

If the EXTERNAL-IP value is none (or perpetually pending), your environment does not provide an external load balancer for the ingress gateway. In this case, you can access the gateway using the service’s node port.

export INGRESS_HOST=$(minikube ip)
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')

Alternatively, port-forward the ingress gateway to a fixed local port and use that as the ingress port:

export INGRESS_HOST=$(minikube ip)
kubectl port-forward --namespace istio-system svc/istio-ingressgateway 30080:80
export INGRESS_PORT=30080
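
Before sending a prediction, you can optionally confirm the model reports ready through KServe's V1 REST protocol (GET /v1/models/<model_name>). The sketch below assumes the INGRESS_HOST and INGRESS_PORT values resolved above, and the SERVICE_HOSTNAME value exported in the invoke step further down, are available as environment variables:

import os

import requests  # assumes the requests package is installed locally

ingress_host = os.environ["INGRESS_HOST"]
ingress_port = os.environ["INGRESS_PORT"]
service_hostname = os.environ["SERVICE_HOSTNAME"]  # e.g. ay-custom-model.kserve-test.example.com

# V1 protocol model metadata/readiness endpoint; the Host header routes to the InferenceService.
resp = requests.get(
    f"http://{ingress_host}:{ingress_port}/v1/models/custom-model",
    headers={"Host": service_hostname},
)
resp.raise_for_status()
print(resp.json())  # should report the model as ready once loading has finished
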
  8. Perform a prediction

First, prepare your inference input request inside a file:

wget -O ./alex-net-input.json https://kserve.github.io/website/0.15/modelserving/v1beta1/custom/custom_model/input.json
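
If you would rather test with an image of your own than the downloaded sample, a minimal sketch like the following writes alex-net-input.json in the same format (my-image.jpg is a placeholder for any local image file):

import base64
import json

# "my-image.jpg" is a placeholder; any local image file works.
with open("my-image.jpg", "rb") as f:
    b64_image = base64.b64encode(f.read()).decode("utf-8")

# Same {"instances": [{"image": {"b64": ...}}]} shape the model's predict() expects.
with open("alex-net-input.json", "w") as f:
    json.dump({"instances": [{"image": {"b64": b64_image}}]}, f)
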

If the ingress node port is not directly reachable from your workstation, you can tunnel it from the minikube node over SSH:

ssh -i ~/.minikube/machines/minikube/id_rsa docker@$(minikube ip) -L "*:${INGRESS_PORT}:0.0.0.0:${INGRESS_PORT}" -N -f
  9. Invoke the service
export SERVICE_HOSTNAME=$(kubectl -n kserve-test get inferenceservice ay-custom-model  -o jsonpath='{.status.url}' | cut -d "/" -f 3)
# http://ay-custom-model.kserve-test.example.com
curl -v -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" -X POST "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/custom-model:predict" -d @./alex-net-input.json
*   Trying 192.168.58.2:30704...
* Connected to 192.168.58.2 (192.168.58.2) port 30704
> POST /v1/models/custom-model:predict HTTP/1.1
> Host: ay-custom-model.kserve-test.example.com
> User-Agent: curl/8.5.0
> Accept: */*
> Content-Type: application/json
> Content-Length: 105339
> 
* We are completely uploaded and fine
< HTTP/1.1 200 OK
< content-length: 110
< content-type: application/json
< date: Wed, 11 Jun 2025 03:38:30 GMT
< prediction-time-latency: 89.966773987
< server: istio-envoy
< x-envoy-upstream-service-time: 93
< 
* Connection #0 to host 192.168.58.2 left intact
{"predictions":[14.975619316101074,14.0368070602417,13.966034889221191,12.252280235290527,12.086270332336426]}