deb http://archive.ubuntu.com/ubuntu/ bionic main restricted
deb http://archive.ubuntu.com/ubuntu/ bionic-updates main restricted
deb http://archive.ubuntu.com/ubuntu/ bionic-backports main restricted universe multiverse
deb http://security.ubuntu.com/ubuntu/ bionic-security main restricted
Ubuntu 20.04 located in /etc/apt/sources.list
deb http://archive.ubuntu.com/ubuntu/ focal main restricted universe multiverse
deb http://archive.ubuntu.com/ubuntu/ focal-updates main restricted universe multiverse
deb http://archive.ubuntu.com/ubuntu/ focal-backports main restricted universe multiverse
deb http://security.ubuntu.com/ubuntu/ focal-security main restricted
Ubuntu 22.04 located in /etc/apt/sources.list
deb http://archive.ubuntu.com/ubuntu/ jammy main restricted
deb http://archive.ubuntu.com/ubuntu/ jammy-updates main restricted
deb http://archive.ubuntu.com/ubuntu/ jammy-backports main restricted universe multiverse
deb http://security.ubuntu.com/ubuntu/ jammy-security main restricted
Debian
Debian Buster located in /etc/apt/sources.list
deb http://deb.debian.org/debian buster main
deb http://security.debian.org/debian-security buster/updates main
deb http://deb.debian.org/debian buster-updates main
deb http://mirrors.aliyun.com/debian/ buster main non-free contrib
deb http://mirrors.aliyun.com/debian-security buster/updates main
deb http://mirrors.aliyun.com/debian/ buster-updates main non-free contrib
deb http://mirrors.aliyun.com/debian/ buster-backports main non-free contrib
deb http://mirrors.tuna.tsinghua.edu.cn/debian/ buster main contrib non-free
deb http://mirrors.tuna.tsinghua.edu.cn/debian/ buster-updates main contrib non-free
deb http://mirrors.tuna.tsinghua.edu.cn/debian/ buster-backports main contrib non-free
deb http://security.debian.org/debian-security buster/updates main contrib non-free
Debian Bullseye located in /etc/apt/sources.list
deb http://deb.debian.org/debian bullseye main
deb http://security.debian.org/debian-security bullseye-security main
deb http://deb.debian.org/debian bullseye-updates main
deb http://mirrors.aliyun.com/debian/ bullseye main non-free contrib
deb http://mirrors.aliyun.com/debian-security/ bullseye-security main
deb http://mirrors.aliyun.com/debian/ bullseye-updates main non-free contrib
deb http://mirrors.aliyun.com/debian/ bullseye-backports main non-free contrib
deb http://mirrors.tuna.tsinghua.edu.cn/debian/ bullseye main contrib non-free
deb http://mirrors.tuna.tsinghua.edu.cn/debian/ bullseye-updates main contrib non-free
deb http://mirrors.tuna.tsinghua.edu.cn/debian/ bullseye-backports main contrib non-free
deb http://security.debian.org/debian-security bullseye-security main contrib non-free
Create an ES backup repository in S3, and take a snapshot after creation
#!/bin/bash
ES_HOST="http://192.168.58.2:30910"ES_BACKUP_REPO_NAME="s3_fs_repository"S3_CLIENT="default"ES_BACKUP_BUCKET_IN_S3="es-snapshot"ES_SNAPSHOT_TAG="auto"CHECK_RESPONSE=$(curl -s -k -X POST "$ES_HOST/_snapshot/$ES_BACKUP_REPO_NAME/_verify?pretty")CHECKED_NODES=$(echo"$CHECK_RESPONSE"| jq -r '.nodes')if["$CHECKED_NODES"== null ];thenecho"Doesn't exist an ES backup setting..."echo"A default backup setting will be generated. (using '$S3_CLIENT' s3 client and all backup files will be saved in a bucket : '$ES_BACKUP_BUCKET_IN_S3'"CREATE_RESPONSE=$(curl -s -k -X PUT "$ES_HOST/_snapshot/$ES_BACKUP_REPO_NAME?pretty" -H 'Content-Type: application/json' -d "{\"type\":\"s3\",\"settings\":{\"bucket\":\"$ES_BACKUP_BUCKET_IN_S3\",\"client\":\"$S3_CLIENT\"}}")CREATE_ACKNOWLEDGED_FLAG=$(echo"$CREATE_RESPONSE"| jq -r '.acknowledged')if["$CREATE_ACKNOWLEDGED_FLAG"==true];thenecho"Buckup setting '$ES_BACKUP_REPO_NAME' has been created successfully!"elseecho"Failed to create backup setting '$ES_BACKUP_REPO_NAME', since $$CREATE_RESPONSE"fielseecho"Already exist an ES backup setting '$ES_BACKUP_REPO_NAME'"fiCHECK_RESPONSE=$(curl -s -k -X POST "$ES_HOST/_snapshot/$ES_BACKUP_REPO_NAME/_verify?pretty")CHECKED_NODES=$(echo"$CHECK_RESPONSE"| jq -r '.nodes')if["$CHECKED_NODES" != null ];thenSNAPSHOT_NAME="meta-data-$ES_SNAPSHOT_TAG-snapshot-$(date +%s)"SNAPSHOT_CREATION=$(curl -s -k -X PUT "$ES_HOST/_snapshot/$ES_BACKUP_REPO_NAME/$SNAPSHOT_NAME")echo"Snapshot $SNAPSHOT_NAME has been created."elseecho"Failed to create snapshot $SNAPSHOT_NAME ."fi
Login Without Pwd
Copy the SSH public key to the other nodes
yum install sshpass -y
mkdir -p /extend/shell
cat >>/extend/shell/fenfa_pub.sh<< EOF
#!/bin/bash
ROOT_PASS=root123
ssh-keygen -t rsa -f ~/.ssh/id_rsa -P ''
for ip in 101 102 103
do
sshpass -p\$ROOT_PASS ssh-copy-id -o StrictHostKeyChecking=no 192.168.29.\$ip
done
EOF
cd /extend/shell
chmod +x fenfa_pub.sh
./fenfa_pub.sh
Returns documents that contain an exact term in a provided field.
You can use the term query to find documents based on a precise value such as a price, a product ID, or a username.
GET /_search
{"query": {"term": {"filed_A": {"value": "kimchy",
"boost": 1.0
}}}}
wildcard query
Returns documents that contain terms matching a wildcard pattern.
A wildcard operator is a placeholder that matches one or more characters. For example, the * wildcard operator matches zero or more characters. You can combine wildcard operators with other characters to create a wildcard pattern.
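As a minimal sketch of the syntax, the following curl request runs a wildcard query; the host, the field name `field_A`, and the pattern are placeholders you should adapt to your cluster and mapping:

```bash
curl -s -X GET "localhost:9200/_search?pretty" -H 'Content-Type: application/json' -d'
{
  "query": {
    "wildcard": {
      "field_A": {
        "value": "kim*y",
        "boost": 1.0
      }
    }
  }
}'
```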
Using deep learning techniques to help you win the game.
Badges: State Machine, Event Bus, Python 3.6, TensorFlow 2, Captain Info, New, Awesome
Screenshots
There are four main functions in this tool.
The first one detects your game client thread and recognizes which status you are in.
The second one recommends champions to play: based on the champions banned by the enemy team,
this tool will provide three more choices to counter your enemies.
The third function scans the mini-map, and when someone is heading toward you,
a notification window will pop up.
The last function provides gear recommendations based on your enemies' item lists.
Using the default flink-s3-fs-hadoop, the configuration values are set into the Hadoop configuration map.
Only one value can take effect at a time, so there is no way for a user to use different configurations within a single job context.
The jar we provide is based on the original flink-s3-fs-hadoop plugin, so you should use the original protocol prefix s3a://.
Or you can wait for the PR: after it is merged into Flink master, you won't need to do anything, just update your Flink version and use s3u:// directly.
Continuously processes antenna signal records, converts them into three-dimensional data matrices, and sends them to different astronomical algorithm endpoints.
how data flows
Building From Zero
Following these steps, you can build comic-antenna from scratch.
ExceptionRecord, RetryControl, and SpeedControl are provided by the yaml crawler itself, so you don't need to worry about them.
You only need to extend how your page (MainPage) is processed; for example, you define a MainPageProcessor.
Each processor produces a set of other pages or DownloadPages. A DownloadPage is like a ship containing the
information you need, and this framework will help you process each DownloadPage and download or persist it.
More often than not, we can get a simple example from CDC Connectors. But people still need to google some unavoidable problems before using it.
Preliminary
Flink: 1.17
JDK: 11
| Flink CDC Version | Flink Version |
| --- | --- |
| 1.0.0 | 1.11.* |
| 1.1.0 | 1.11.* |
| 1.2.0 | 1.12.* |
| 1.3.0 | 1.12.* |
| 1.4.0 | 1.13.* |
| 2.0.* | 1.13.* |
| 2.1.* | 1.13.* |
| 2.2.* | 1.13.*, 1.14.* |
| 2.3.* | 1.13.*, 1.14.*, 1.15.* |
| 2.4.* | 1.13.*, 1.14.*, 1.15.* |
| 3.0.* | 1.14.*, 1.15.*, 1.16.* |
Usage for the DataStream API
Importing only com.ververica:flink-connector-mysql-cdc is not enough.
implementation("com.ververica:flink-connector-mysql-cdc:2.4.0")//you also need these following dependencies
implementation("org.apache.flink:flink-shaded-guava:30.1.1-jre-16.1")implementation("org.apache.flink:flink-connector-base:1.17")implementation("org.apache.flink:flink-table-planner_2.12:1.17")
<dependency>
    <groupId>com.ververica</groupId>
    <!-- add the dependency matching your database -->
    <artifactId>flink-connector-mysql-cdc</artifactId>
    <!-- The dependency is available only for stable releases, SNAPSHOT dependencies need to be built based on master or release- branches by yourself. -->
    <version>2.4.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-shaded-guava -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-shaded-guava</artifactId>
    <version>30.1.1-jre-16.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-base -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-base</artifactId>
    <version>1.17.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-table-planner -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-table-planner_2.12</artifactId>
    <version>1.17.1</version>
</dependency>
Example Code
MySqlSource<String> mySqlSource = MySqlSource.<String>builder()
        .hostname("192.168.56.107")
        .port(3306)
        .databaseList("test")       // set captured database
        .tableList("test.table_a")  // set captured table
        .username("root")
        .password("mysql")
        .deserializer(new JsonDebeziumDeserializationSchema()) // converts SourceRecord to JSON String
        .serverTimeZone("UTC")
        .build();

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// enable checkpoint
env.enableCheckpointing(3000);

env.fromSource(mySqlSource, WatermarkStrategy.noWatermarks(), "MySQL Source")
        // set 4 parallel source tasks
        .setParallelism(4)
        .print()
        .setParallelism(1); // use parallelism 1 for sink to keep message ordering

env.execute("Print MySQL Snapshot + Binlog");
Install a Pod Network Add-on
You can install a network add-on like Flannel, Calico, or Weave. For example, to install Calico:
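As a sketch, applying the Calico manifest looks like the command below; the version pinned in the URL is only an example, so check the Calico releases page for the current manifest:

```bash
kubectl apply -f https://raw.githubusercontent.com/projectcalico/calico/v3.27.0/manifests/calico.yaml
```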
Join Worker Nodes to the Cluster
On each worker node, run the kubeadm join command provided at the end of the kubeadm init output on the master node. It will look something like this:
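A sketch of the join command; the address, token, and hash are placeholders that come from your own kubeadm init output:

```bash
kubeadm join 192.168.1.100:6443 --token <token> \
    --discovery-token-ca-cert-hash sha256:<hash>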
--ulimit nofile=262144:262144: sets the soft and hard limits on the number of open file descriptors for the container to 262144.
ulimit is a Linux shell built-in used to see, set, or limit the resource usage of the current user, for example the number of open file descriptors available to each process. It can also be used to set restrictions on other resources used by a process; raising hard limits requires administrator privileges.
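A quick sketch of the command in a shell (the value 65535 is only an example):

```bash
ulimit -n          # show the current soft limit on open file descriptors
ulimit -n 65535    # raise the soft limit for the current shell (raising the hard limit requires root)
ulimit -u          # show the maximum number of user processes
```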
you need to rename clusters.cluster.certificate-authority, clusters.cluster.server, users.user.client-certificate, users.user.client-key.
clusters.cluster.certificate-authority -> clusters.cluster.certificate-authority-data
clusters.cluster.server -> ip set to `localhost`
users.user.client-certificate -> users.user.client-certificate-data
users.user.client-key -> users.user.client-key-data
The data you paste after each key should be base64-encoded:
cat <$file_path> | base64
Then the modified config file should look like this:
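A minimal sketch of the resulting kubeconfig, assuming a kubeadm-style admin config; the cluster, user, and context names are illustrative, and the key names follow the renames listed above:

```bash
cat <<'EOF' > ~/.kube/config
apiVersion: v1
kind: Config
clusters:
- name: kubernetes
  cluster:
    certificate-authority-data: <base64 of the CA certificate>
    server: https://localhost:6443
users:
- name: kubernetes-admin
  user:
    client-certificate-data: <base64 of the client certificate>
    client-key-data: <base64 of the client key>
contexts:
- name: kubernetes-admin@kubernetes
  context:
    cluster: kubernetes
    user: kubernetes-admin
current-context: kubernetes-admin@kubernetes
EOF
```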
import(_"k8s.io/client-go/plugin/pkg/client/auth"ctrl"sigs.k8s.io/controller-runtime")// nolint:gocyclofuncmain(){...mgr,err:=ctrl.NewManager(ctrl.GetConfigOrDie(),ctrl.Options{}...iferr=(&controller.GuestbookReconciler{Client:mgr.GetClient(),Scheme:mgr.GetScheme(),}).SetupWithManager(mgr);err!=nil{setupLog.Error(err,"unable to create controller","controller","Guestbook")os.Exit(1)}...ifos.Getenv("ENABLE_WEBHOOKS")!="false"{iferr=webhookwebappv1.SetupGuestbookWebhookWithManager(mgr);err!=nil{setupLog.Error(err,"unable to create webhook","webhook","Guestbook")os.Exit(1)}}
The Kubernetes API Server pushes resource change events over long-lived HTTP connections, and the client-go Informer listens for these messages.
Event: an event is the information passed between the Kubernetes API Server and a Controller. It contains the resource type, resource name, and event type (ADDED, MODIFIED, DELETED), and is converted into requests, check link
API Server → Manager's Informer → Cache → Controller's Watch → Predicate filtering → WorkQueue → Controller's Reconcile() method
Controller
It’s a controller’s job to ensure that, for any given object, the actual state of the world matches the desired state in the object. Each controller focuses on one root Kind, but may interact with other Kinds.
You can modify the file /~/projects/guestbook/api/v1/guestbook_types.go
type GuestbookSpec struct {
    // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster
    // Important: Run "make" to regenerate code after modifying this file

    // Foo is an example field of Guestbook. Edit guestbook_types.go to remove/update
    Foo string `json:"foo,omitempty"`
}
which corresponds to the file /~/projects/guestbook/config/samples/webapp_v1_guestbook.yaml
If you are editing the API definitions, generate the manifests such as Custom Resources (CRs) or Custom Resource Definitions (CRDs) using
make manifests
You can modify the file /~/projects/guestbook/internal/controller/guestbook_controller.go
// "fmt"// "k8s.io/apimachinery/pkg/api/errors"// "k8s.io/apimachinery/pkg/types"// appsv1 "k8s.io/api/apps/v1"// corev1 "k8s.io/api/core/v1"// metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"func(r*GuestbookReconciler)Reconcile(ctxcontext.Context,reqctrl.Request)(ctrl.Result,error){// The context is used to allow cancellation of requests, and potentially things like tracing. _=log.FromContext(ctx)fmt.Printf("I am a controller ->>>>>>")fmt.Printf("Name: %s, Namespace: %s",req.Name,req.Namespace)guestbook:=&webappv1.Guestbook{}iferr:=r.Get(ctx,req.NamespacedName,guestbook);err!=nil{returnctrl.Result{},err}fooString:=guestbook.Spec.Fooreplicas:=int32(1)fmt.Printf("Foo String: %s",fooString)// labels := map[string]string{// "app": req.Name,// }// dep := &appsv1.Deployment{// ObjectMeta: metav1.ObjectMeta{// Name: fooString + "-deployment",// Namespace: req.Namespace,// Labels: labels,// },// Spec: appsv1.DeploymentSpec{// Replicas: &replicas,// Selector: &metav1.LabelSelector{// MatchLabels: labels,// },// Template: corev1.PodTemplateSpec{// ObjectMeta: metav1.ObjectMeta{// Labels: labels,// },// Spec: corev1.PodSpec{// Containers: []corev1.Container{{// Name: fooString,// Image: "busybox:latest",// }},// },// },// },// }// existingDep := &appsv1.Deployment{}// err := r.Get(ctx, types.NamespacedName{Name: dep.Name, Namespace: dep.Namespace}, existingDep)// if err != nil {// if errors.IsNotFound(err) {// if err := r.Create(ctx, dep); err != nil {// return ctrl.Result{}, err// }// } else {// return ctrl.Result{}, err// }// }returnctrl.Result{},nil}
And you can use make run to test your controller.
make run
and use following command to send a request
Make sure you have installed the CRDs (make install) before you execute the following command.
make install
kubectl apply -k config/samples/
Your controller terminal should look like this:
I am a controller ->>>>>>Name: guestbook-sample, Namespace: defaultFoo String: foo-value
Install CRDs
check installed crds in k8s
kubectl get crds
install guestbook crd in k8s
cd ~/projects/guestbook
make install
uninstall CRDs
make uninstall
make undeploy
Deploy to cluster
make docker-build IMG=aaron666/guestbook-operator:test
make docker-build docker-push IMG=<some-registry>/<project-name>:tag
make deploy IMG=<some-registry>/<project-name>:tag
➜ ~ kubectl kubevpn connect
Password:
Starting connect
Getting network CIDR from cluster info...
Getting network CIDR from CNI...
Getting network CIDR from services...
Labeling Namespace default
Creating ServiceAccount kubevpn-traffic-manager
Creating Roles kubevpn-traffic-manager
Creating RoleBinding kubevpn-traffic-manager
Creating Service kubevpn-traffic-manager
Creating MutatingWebhookConfiguration kubevpn-traffic-manager
Creating Deployment kubevpn-traffic-manager
Pod kubevpn-traffic-manager-66d969fd45-9zlbp is Pending
Container Reason Message
control-plane ContainerCreating
vpn ContainerCreating
webhook ContainerCreating
Pod kubevpn-traffic-manager-66d969fd45-9zlbp is Running
Container Reason Message
control-plane ContainerRunning
vpn ContainerRunning
webhook ContainerRunning
Forwarding port...
Connected tunnel
Adding route...
Configured DNS service
+----------------------------------------------------------+
| Now you can access resources in the kubernetes cluster ! |
+----------------------------------------------------------+
already connected to cluster network, use command kubectl kubevpn status to check status
➜ ~ kubectl kubevpn status
ID Mode Cluster Kubeconfig Namespace Status Netif
0 full ops-dev /root/.kube/zverse_config data-and-computing Connected utun0
use pod productpage-788df7ff7f-jpkcs IP 172.29.2.134
1. If you have only one node in your cluster, you need at least 6 CPUs, 6 GB of memory, and 30 GB of disk storage.
2. If you have multiple nodes in your cluster, for each node you need at least 2 CPUs, 4 GB of memory, and 20 GB of disk storage.
kubectl apply -f - <<EOF
apiVersion: operator.knative.dev/v1beta1
kind: KnativeServing
metadata:
  name: knative-serving
  namespace: knative-serving
spec:
  version: 1.18.0 # this is knative serving version
  config:
    domain:
      example.com: ""
EOF
Internal error occurred: failed calling webhook "clusterservingruntime.kserve-webhook-server.validator": failed to call webhook: Post "https://kserve-webhook-server-service.kserve.svc:443/validate-serving-kserve-io-v1alpha1-clusterservingruntime?timeout=10s": no endpoints available for service "kserve-webhook-server-service" Running 114s
Just wait a while for the resync, and it will be fine.
kubectl -n kserve-test get inferenceservices first-torchserve
kubectl -n kserve-test get pod
# NAME                                        READY   STATUS    RESTARTS   AGE
# first-torchserve-predictor-00001-deplo...   2/2     Running   0          25s

kubectl -n kserve-test get inferenceservices first-torchserve

# NAME                           URL                                                READY   PREV   LATEST   PREVROLLEDOUTREVISION   LATESTREADYREVISION                AGE
# kserve-test first-torchserve   http://first-torchserve.kserve-test.example.com    True           100                              first-torchserve-predictor-00001   2m59s
After all pods are ready, you can access the service by using the following command
Access By
If the EXTERNAL-IP value is set, your environment has an external load balancer that you can use for the ingress gateway.
export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
If the EXTERNAL-IP value is none (or perpetually pending), your environment does not provide an external load balancer for the ingress gateway. In this case, you can access the gateway using the service’s node port.
export INGRESS_HOST=$(minikube ip)
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
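A sketch of the request that produces the trace below; the InferenceService name and the input file ./mnist-input.json are assumptions, so adjust them to match your deployment and the Host header shown in the trace:

```bash
SERVICE_HOSTNAME=$(kubectl -n kserve-test get inferenceservice first-torchserve -o jsonpath='{.status.url}' | cut -d "/" -f 3)
curl -v -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" \
    "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/mnist:predict" \
    -d @./mnist-input.json
```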
* Trying 192.168.58.2...
* TCP_NODELAY set
* Connected to 192.168.58.2 (192.168.58.2) port 32132 (#0)
> POST /v1/models/mnist:predict HTTP/1.1
> Host: my-torchserve.kserve-test.example.com
> User-Agent: curl/7.61.1
> Accept: */*
> Content-Type: application/json
> Content-Length: 401
>
* upload completely sent off: 401 out of 401 bytes
< HTTP/1.1 200 OK
< content-length: 19
< content-type: application/json
< date: Mon, 09 Jun 2025 09:27:27 GMT
< server: istio-envoy
< x-envoy-upstream-service-time: 1128
<
* Connection #0 to host 192.168.58.2 left intact
{"predictions":[2]}
First Custom Model
AlexNet Inference
More Information about AlexNet service can be found 🔗link
Implement Custom Model using KServe API
import argparse
import base64
import io
import time

from fastapi.middleware.cors import CORSMiddleware
from torchvision import models, transforms
from typing import Dict
import torch
from PIL import Image

import kserve
from kserve import Model, ModelServer, logging
from kserve.model_server import app
from kserve.utils.utils import generate_uuid


class AlexNetModel(Model):
    def __init__(self, name: str):
        super().__init__(name, return_response_headers=True)
        self.name = name
        self.load()
        self.ready = False

    def load(self):
        self.model = models.alexnet(pretrained=True)
        self.model.eval()
        # The ready flag is used by model ready endpoint for readiness probes,
        # set to True when model is loaded successfully without exceptions.
        self.ready = True

    async def predict(
        self,
        payload: Dict,
        headers: Dict[str, str] = None,
        response_headers: Dict[str, str] = None,
    ) -> Dict:
        start = time.time()
        # Input follows the Tensorflow V1 HTTP API for binary values
        # https://www.tensorflow.org/tfx/serving/api_rest#encoding_binary_values
        img_data = payload["instances"][0]["image"]["b64"]
        raw_img_data = base64.b64decode(img_data)
        input_image = Image.open(io.BytesIO(raw_img_data))
        preprocess = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
        input_tensor = preprocess(input_image).unsqueeze(0)
        output = self.model(input_tensor)
        torch.nn.functional.softmax(output, dim=1)
        values, top_5 = torch.topk(output, 5)
        result = values.flatten().tolist()
        end = time.time()
        response_id = generate_uuid()

        # Custom response headers can be added to the inference response
        if response_headers is not None:
            response_headers.update(
                {"prediction-time-latency": f"{round((end - start) * 1000, 9)}"}
            )

        return {"predictions": result}


parser = argparse.ArgumentParser(parents=[kserve.model_server.parser])
args, _ = parser.parse_known_args()

if __name__ == "__main__":
    # Configure kserve and uvicorn logger
    if args.configure_logging:
        logging.configure_logging(args.log_config_file)
    model = AlexNetModel(args.model_name)
    model.load()
    # Custom middlewares can be added to the model
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    ModelServer().start([model])
kubectl -n kserve-test get inferenceservices ay-custom-model
kubectl -n kserve-test get pod
# NAME                                      READY   STATUS    RESTARTS   AGE
# ay-custom-model-predictor-00003-dcf4rk    2/2     Running   0          167m

kubectl -n kserve-test get inferenceservices ay-custom-model

# NAME              URL                                               READY   PREV   LATEST   PREVROLLEDOUTREVISION   LATESTREADYREVISION               AGE
# ay-custom-model   http://ay-custom-model.kserve-test.example.com    True           100                              ay-custom-model-predictor-00003   177m
After all pods are ready, you can access the service by using the following command
Access By
If the EXTERNAL-IP value is set, your environment has an external load balancer that you can use for the ingress gateway.
export INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
If the EXTERNAL-IP value is none (or perpetually pending), your environment does not provide an external load balancer for the ingress gateway. In this case, you can access the gateway using the service’s node port.
export INGRESS_HOST=$(minikube ip)
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
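A sketch of the request behind the trace below; the input file ./input.json (a base64-encoded image payload) is an assumption:

```bash
SERVICE_HOSTNAME=$(kubectl -n kserve-test get inferenceservice ay-custom-model -o jsonpath='{.status.url}' | cut -d "/" -f 3)
curl -v -H "Host: ${SERVICE_HOSTNAME}" -H "Content-Type: application/json" \
    "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/custom-model:predict" \
    -d @./input.json
```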
* Trying 192.168.58.2:30704...
* Connected to 192.168.58.2 (192.168.58.2) port 30704
> POST /v1/models/custom-model:predict HTTP/1.1
> Host: ay-custom-model.kserve-test.example.com
> User-Agent: curl/8.5.0
> Accept: */*
> Content-Type: application/json
> Content-Length: 105339
>
* We are completely uploaded and fine
< HTTP/1.1 200 OK
< content-length: 110
< content-type: application/json
< date: Wed, 11 Jun 2025 03:38:30 GMT
< prediction-time-latency: 89.966773987
< server: istio-envoy
< x-envoy-upstream-service-time: 93
<
* Connection #0 to host 192.168.58.2 left intact
{"predictions":[14.975619316101074,14.0368070602417,13.966034889221191,12.252280235290527,12.086270332336426]}
First Model In Minio
Inference Model In Minio
More Information about Deploy InferenceService with a saved model on S3 can be found 🔗link
Create Service Account
=== “yaml”
apiVersion: v1
kind: ServiceAccount
metadata:
  name: sa
  annotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/s3access # replace with your IAM role ARN
    serving.kserve.io/s3-endpoint: s3.amazonaws.com # replace with your s3 endpoint e.g minio-service.kubeflow:9000
    serving.kserve.io/s3-usehttps: "1" # by default 1, if testing with minio you can set to 0
    serving.kserve.io/s3-region: "us-east-2"
    serving.kserve.io/s3-useanoncredential: "false" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials
=== “kubectl”
kubectl apply -f create-s3-sa.yaml
Create S3 Secret and attach to Service Account
Create a secret with your S3 user credentials. KServe reads the secret annotations to inject the S3 environment variables into the storage initializer or model agent so it can download the models from S3 storage.
Create S3 secret
=== “yaml”
apiVersion: v1
kind: Secret
metadata:
  name: s3creds
  annotations:
    serving.kserve.io/s3-endpoint: s3.amazonaws.com # replace with your s3 endpoint e.g minio-service.kubeflow:9000
    serving.kserve.io/s3-usehttps: "1" # by default 1, if testing with minio you can set to 0
    serving.kserve.io/s3-region: "us-east-2"
    serving.kserve.io/s3-useanoncredential: "false" # omitting this is the same as false, if true will ignore provided credential and use anonymous credentials
type: Opaque
stringData: # use `stringData` for raw credential string or `data` for base64 encoded string
  AWS_ACCESS_KEY_ID: XXXX
  AWS_SECRET_ACCESS_KEY: XXXXXXXX
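To complete the "attach to Service Account" step, a sketch of the patch command, assuming the service account is the `sa` created above and the secret is `s3creds`:

```bash
kubectl patch serviceaccount sa -p '{"secrets": [{"name": "s3creds"}]}'
```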
!!! note
If you are running kserve with istio sidecars enabled, there can be a race condition between the istio proxy being ready and the agent pulling models. This will result in a tcp dial connection refused error when the agent tries to download from s3.
To resolve it, istio allows the blocking of other containers in a pod until the proxy container is ready.
You can enable this by setting `proxy.holdApplicationUntilProxyStarts: true` in the `istio-sidecar-injector` configmap. The `proxy.holdApplicationUntilProxyStarts` flag was introduced in Istio 1.7 as an experimental feature and is turned off by default.
Deploy the model on S3 with InferenceService
Create the InferenceService with the s3 storageUri and the service account with s3 credential attached.
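A minimal sketch of such an InferenceService, assuming the service account `sa` from the earlier example and a hypothetical bucket path; the trace below uses the name mnist-s3 in the default namespace:

```bash
kubectl apply -f - <<EOF
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: mnist-s3
spec:
  predictor:
    serviceAccountName: sa
    model:
      modelFormat:
        name: tensorflow
      storageUri: s3://your-bucket/mnist   # hypothetical bucket/path
EOF
```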
```{ .bash .no-copy }
Note: Unnecessary use of -X or --request, POST is already inferred.
* Trying 35.237.217.209...
* TCP_NODELAY set
* Connected to mnist-s3.default.35.237.217.209.xip.io (35.237.217.209) port 80 (#0)
> POST /v1/models/mnist-s3:predict HTTP/1.1
> Host: mnist-s3.default.35.237.217.209.xip.io
> User-Agent: curl/7.55.1
> Accept: */*
> Content-Length: 2052
> Content-Type: application/x-www-form-urlencoded
> Expect: 100-continue
>
< HTTP/1.1 100 Continue
* We are completely uploaded and fine
< HTTP/1.1 200 OK
< content-length: 251
< content-type: application/json
< date: Sun, 04 Apr 2021 20:06:27 GMT
< x-envoy-upstream-service-time: 5
< server: istio-envoy
<
* Connection #0 to host mnist-s3.default.35.237.217.209.xip.io left intact
{
"predictions": [
{
"predictions": [0.327352405, 2.00153053e-07, 0.0113353515, 0.203903764, 3.62863029e-05, 0.416683704, 0.000281196437, 8.36911859e-05, 0.0403052084, 1.82206513e-05],
"classes": 5
}
]
}
```
Kafka Sink Transformer
AlexNet Inference
More Information about Custom Transformer service can be found 🔗link
Implement Custom Transformer ./model.py using Kserve API
import os
import argparse
import json

from typing import Dict, Union
from kafka import KafkaProducer
from cloudevents.http import CloudEvent
from cloudevents.conversion import to_structured

from kserve import (
    Model,
    ModelServer,
    model_server,
    logging,
    InferRequest,
    InferResponse,
)

from kserve.logging import logger
from kserve.utils.utils import generate_uuid

kafka_producer = KafkaProducer(
    value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    bootstrap_servers=os.environ.get('KAFKA_BOOTSTRAP_SERVERS', 'localhost:9092')
)


class ImageTransformer(Model):
    def __init__(self, name: str):
        super().__init__(name, return_response_headers=True)
        self.ready = True

    def preprocess(
        self, payload: Union[Dict, InferRequest], headers: Dict[str, str] = None
    ) -> Union[Dict, InferRequest]:
        logger.info("Received inputs %s", payload)
        logger.info("Received headers %s", headers)
        self.request_trace_key = os.environ.get('REQUEST_TRACE_KEY', 'algo.trace.requestId')
        if self.request_trace_key not in payload:
            logger.error("Request trace key '%s' not found in payload, you cannot trace the prediction result", self.request_trace_key)
            if "instances" not in payload:
                raise ValueError(
                    f"Request trace key '{self.request_trace_key}' not found in payload and 'instances' key is missing."
                )
        else:
            headers[self.request_trace_key] = payload.get(self.request_trace_key)

        return {"instances": payload["instances"]}

    def postprocess(
        self,
        infer_response: Union[Dict, InferResponse],
        headers: Dict[str, str] = None,
        response_headers: Dict[str, str] = None,
    ) -> Union[Dict, InferResponse]:
        logger.info("postprocess headers: %s", headers)
        logger.info("postprocess response headers: %s", response_headers)
        logger.info("postprocess response: %s", infer_response)

        attributes = {
            "source": "data-and-computing/kafka-sink-transformer",
            "type": "org.zhejianglab.zverse.data-and-computing.kafka-sink-transformer",
            "request-host": headers.get('host', 'unknown'),
            "kserve-isvc-name": headers.get('kserve-isvc-name', 'unknown'),
            "kserve-isvc-namespace": headers.get('kserve-isvc-namespace', 'unknown'),
            self.request_trace_key: headers.get(self.request_trace_key, 'unknown'),
        }

        _, cloudevent = to_structured(CloudEvent(attributes, infer_response))
        try:
            kafka_producer.send(os.environ.get('KAFKA_TOPIC', 'test-topic'), value=cloudevent.decode('utf-8').replace("'", '"'))
            kafka_producer.flush()
        except Exception as e:
            logger.error("Failed to send message to Kafka: %s", e)
        return infer_response


parser = argparse.ArgumentParser(parents=[model_server.parser])
args, _ = parser.parse_known_args()

if __name__ == "__main__":
    if args.configure_logging:
        logging.configure_logging(args.log_config_file)
    logging.logger.info("available model name: %s", args.model_name)
    logging.logger.info("all args: %s", args.model_name)
    model = ImageTransformer(args.model_name)
    ModelServer().start([model])
modify ./pyproject.toml
[tool.poetry]
name = "custom_transformer"
version = "0.15.2"
description = "Custom Transformer Examples. Not intended for use outside KServe Frameworks Images."
authors = ["Dan Sun <dsun20@bloomberg.net>"]
license = "Apache-2.0"
packages = [{ include = "*.py" }]

[tool.poetry.dependencies]
python = ">=3.9,<3.13"
kserve = { path = "../kserve", develop = true }
pillow = "^10.3.0"
kafka-python = "^2.2.15"
cloudevents = "^1.11.1"

[[tool.poetry.source]]
name = "pytorch"
url = "https://download.pytorch.org/whl/cpu"
priority = "explicit"

[tool.poetry.group.test]
optional = true

[tool.poetry.group.test.dependencies]
pytest = "^7.4.4"
mypy = "^0.991"

[tool.poetry.group.dev]
optional = true

[tool.poetry.group.dev.dependencies]
black = { version = "~24.3.0", extras = ["colorama"] }

[tool.poetry-version-plugin]
source = "file"
file_path = "../VERSION"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
kubectl -n kserve-test get inferenceservices sklearn-iris
kubectl -n istio-system get svc istio-ingressgateway
export INGRESS_HOST=$(minikube ip)
export INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway -o jsonpath='{.spec.ports[?(@.name=="http2")].nodePort}')
KServe supports canary rollouts for inference services. Canary rollouts allow for a new version of an InferenceService to receive a percentage of traffic. Kserve supports a configurable canary rollout strategy with multiple steps. The rollout strategy can also be implemented to rollback to the previous revision if a rollout step fails.
KServe automatically tracks the last good revision that was rolled out with 100% traffic. The canaryTrafficPercent field in the component’s spec needs to be set with the percentage of traffic that should be routed to the new revision. KServe will then automatically split the traffic between the last good revision and the revision that is currently being rolled out according to the canaryTrafficPercent value.
When the first revision of an InferenceService is deployed, it will receive 100% of the traffic. When multiple revisions are deployed, as in step 2, and the canary rollout strategy is configured to route 10% of the traffic to the new revision, 90% of the traffic will go to the LatestRolledoutRevision. If there is an unhealthy or bad revision applied, traffic will not be routed to that bad revision. In step 3, the rollout strategy promotes the LatestReadyRevision from step 2 to the LatestRolledoutRevision. Since it is now promoted, the LatestRolledoutRevision gets 100% of the traffic and is fully rolled out. If a rollback needs to happen, 100% of the traffic will be pinned to the previous healthy/good revision, the PreviousRolledoutRevision.
After rolling out the canary model, traffic is split between the latest ready revision 2 and the previously rolled out revision 1.
kubectl -n kserve-test get isvc sklearn-iris
NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE
sklearn-iris http://sklearn-iris.kserve-test.example.com True 90 10 sklearn-iris-predictor-00002 sklearn-iris-predictor-00003 19h
Check the running pods; you should now see two pods running for the old and new models, with 10% of the traffic routed to
the new model. Notice that revision 1 contains 0002 in its name, while revision 2 contains 0003.
kubectl get pods
NAME READY STATUS RESTARTS AGE
sklearn-iris-predictor-00002-deployment-c7bb6c685-ktk7r 2/2 Running 0 71m
sklearn-iris-predictor-00003-deployment-8498d947-fpzcg 2/2 Running 0 20m
Now all traffic goes to the revision 2 for the new model.
kubectl get isvc sklearn-iris
NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE
sklearn-iris http://sklearn-iris.kserve-test.example.com True 100 sklearn-iris-predictor-00002 17m
The pods for revision generation 1 automatically scale down to 0 as they no longer receive traffic.
kubectl get pods -l serving.kserve.io/inferenceservice=sklearn-iris
NAME READY STATUS RESTARTS AGE
sklearn-iris-predictor-00001-deployment-66c5f5b8d5-gmfvj 1/2 Terminating 0 17m
sklearn-iris-predictor-00002-deployment-5bd9ff46f8-shtzd 2/2 Running 0 15m
Rollback and pin the previous model
You can pin the previous model (model v1, for example) by setting the canaryTrafficPercent to 0 for the current
model (model v2, for example). This rolls back from model v2 to model v1 and decreases model v2’s traffic to zero.
Apply the custom resource to set model v2’s traffic to 0%.
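A sketch of what that looks like for the sklearn-iris example used in this section; the storageUri for model v2 is illustrative:

```bash
kubectl apply -n kserve-test -f - <<EOF
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: sklearn-iris
spec:
  predictor:
    canaryTrafficPercent: 0            # pin 100% of traffic to the previous revision
    model:
      modelFormat:
        name: sklearn
      storageUri: gs://your-bucket/sklearn/model-v2   # hypothetical model v2 path
EOF
```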
Check the traffic split, now 100% traffic goes to the previous good model (model v1) for revision generation 1.
kubectl get isvc sklearn-iris
NAME URL READY PREV LATEST PREVROLLEDOUTREVISION LATESTREADYREVISION AGE
sklearn-iris   http://sklearn-iris.kserve-test.example.com   True    100    0        sklearn-iris-predictor-00002   sklearn-iris-predictor-00003   18m
The previous revision (model v1) now receives 100% of the traffic on its pods, while the new
model (model v2) receives 0% of the traffic on its pods.
kubectl get pods -l serving.kserve.io/inferenceservice=sklearn-iris
NAME READY STATUS RESTARTS AGE
sklearn-iris-predictor-00002-deployment-66c5f5b8d5-gmfvj 1/2 Running 0 35s
sklearn-iris-predictor-00003-deployment-5bd9ff46f8-shtzd 2/2 Running 0 16m
Route traffic using a tag
You can enable tag based routing by adding the annotation serving.kserve.io/enable-tag-routing, so traffic can be
explicitly routed to the canary model (model v2) or the old model (model v1) via a tag in the request URL.
Apply model v2 with canaryTrafficPercent: 10 and serving.kserve.io/enable-tag-routing: "true".
Since we updated the annotation on the InferenceService, model v2 now corresponds to sklearn-iris-predictor-00003.
You can now send the request explicitly to the new model or the previous model by using the tag in the request URL. Use the curl command from Perform inference and add latest- or prev- to the model name to send a tag-based request.
For example, set the model name and use the following commands to send traffic to each service based on the latest or prev tag.
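A sketch of those requests; the exact tag-prefixed host format depends on your KServe/Knative version and domain, so treat the hostnames below as assumptions and check the InferenceService status for the real URLs:

```bash
MODEL_NAME=sklearn-iris
# request the latest (canary) revision
curl -v -H "Host: latest-${MODEL_NAME}-predictor.kserve-test.example.com" -H "Content-Type: application/json" \
    "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict" -d @./iris-input.json
# request the previous revision
curl -v -H "Host: prev-${MODEL_NAME}-predictor.kserve-test.example.com" -H "Content-Type: application/json" \
    "http://${INGRESS_HOST}:${INGRESS_PORT}/v1/models/${MODEL_NAME}:predict" -d @./iris-input.json
```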
You can configure InferenceService with annotation autoscaling.knative.dev/target for a soft limit. The soft limit is a targeted limit rather than a strictly enforced bound, particularly if there is a sudden burst of requests, this value can be exceeded.
You can also configure InferenceService with field containerConcurrency with a hard limit. The hard limit is an enforced upper bound. If concurrency reaches the hard limit, surplus requests will be buffered and must wait until enough capacity is free to execute the requests.
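A sketch combining both options on one manifest (in practice you would usually pick one); the model path and the values are illustrative, and KServe propagates InferenceService annotations down to the underlying Knative revision:

```bash
kubectl apply -n kserve-test -f - <<EOF
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: sklearn-iris
  annotations:
    autoscaling.knative.dev/target: "5"   # soft limit: targeted concurrency per replica
spec:
  predictor:
    containerConcurrency: 5               # hard limit: enforced upper bound
    model:
      modelFormat:
        name: sklearn
      storageUri: gs://your-bucket/sklearn/model   # hypothetical model path
EOF
```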
configmap/kafka-broker-config created
configmap/kafka-channel-config created
customresourcedefinition.apiextensions.k8s.io/kafkachannels.messaging.knative.dev created
customresourcedefinition.apiextensions.k8s.io/consumers.internal.kafka.eventing.knative.dev created
customresourcedefinition.apiextensions.k8s.io/consumergroups.internal.kafka.eventing.knative.dev created
customresourcedefinition.apiextensions.k8s.io/kafkasinks.eventing.knative.dev created
customresourcedefinition.apiextensions.k8s.io/kafkasources.sources.knative.dev created
clusterrole.rbac.authorization.k8s.io/eventing-kafka-source-observer created
configmap/config-kafka-source-defaults created
configmap/config-kafka-autoscaler created
configmap/config-kafka-features created
configmap/config-kafka-leader-election created
configmap/kafka-config-logging created
configmap/config-namespaced-broker-resources created
configmap/config-tracing configured
clusterrole.rbac.authorization.k8s.io/knative-kafka-addressable-resolver created
clusterrole.rbac.authorization.k8s.io/knative-kafka-channelable-manipulator created
clusterrole.rbac.authorization.k8s.io/kafka-controller created
serviceaccount/kafka-controller created
clusterrolebinding.rbac.authorization.k8s.io/kafka-controller created
clusterrolebinding.rbac.authorization.k8s.io/kafka-controller-addressable-resolver created
deployment.apps/kafka-controller created
clusterrole.rbac.authorization.k8s.io/kafka-webhook-eventing created
serviceaccount/kafka-webhook-eventing created
clusterrolebinding.rbac.authorization.k8s.io/kafka-webhook-eventing created
mutatingwebhookconfiguration.admissionregistration.k8s.io/defaulting.webhook.kafka.eventing.knative.dev created
mutatingwebhookconfiguration.admissionregistration.k8s.io/pods.defaulting.webhook.kafka.eventing.knative.dev created
secret/kafka-webhook-eventing-certs created
validatingwebhookconfiguration.admissionregistration.k8s.io/validation.webhook.kafka.eventing.knative.dev created
deployment.apps/kafka-webhook-eventing created
service/kafka-webhook-eventing created
configmap/config-kafka-channel-data-plane created
clusterrole.rbac.authorization.k8s.io/knative-kafka-channel-data-plane created
serviceaccount/knative-kafka-channel-data-plane created
clusterrolebinding.rbac.authorization.k8s.io/knative-kafka-channel-data-plane created
statefulset.apps/kafka-channel-dispatcher created
deployment.apps/kafka-channel-receiver created
service/kafka-channel-ingress created
configmap/config-kafka-broker-data-plane created
clusterrole.rbac.authorization.k8s.io/knative-kafka-broker-data-plane created
serviceaccount/knative-kafka-broker-data-plane created
clusterrolebinding.rbac.authorization.k8s.io/knative-kafka-broker-data-plane created
statefulset.apps/kafka-broker-dispatcher created
deployment.apps/kafka-broker-receiver created
service/kafka-broker-ingress created
flowchart LR
A[Curl] -->|HTTP| B{Broker}
B -->|Subscribe| D[Trigger1]
B -->|Subscribe| E[Trigger2]
B -->|Subscribe| F[Trigger3]
E --> G[Display Service]
Steps
1. Create Broker Setting
kubectl apply -f - <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: kafka-broker-config
  namespace: knative-eventing
data:
  default.topic.partitions: "10"
  default.topic.replication.factor: "1"
  bootstrap.servers: "kafka.database.svc.cluster.local:9092" # kafka service address
  default.topic.config.retention.ms: "3600"
EOF
flowchart LR
A[User Curl] -->|HTTP| B{ISVC-Broker:Kafka}
B -->|Subscribe| D[Trigger1]
B -->|Subscribe| E[Kserve-Trigger]
B -->|Subscribe| F[Trigger3]
E --> G[Mnist Service]
G --> |Kafka-Sink| B
Steps
1. Create Broker Setting
kubectl apply -f - <<EOF
apiVersion: v1
kind: ConfigMap
metadata:
  name: kafka-broker-config
  namespace: knative-eventing
data:
  default.topic.partitions: "10"
  default.topic.replication.factor: "1"
  bootstrap.servers: "kafka.database.svc.cluster.local:9092" # kafka service address
  default.topic.config.retention.ms: "3600"
EOF
kubectl apply -f - <<EOF
apiVersion: eventing.knative.dev/v1
kind: Trigger
metadata:
  name: kserve-trigger
  namespace: kserve-test
spec:
  broker: isvc-broker
  filter:
    attributes:
      type: prediction-request-udf-attr # you can change this
  subscriber:
    uri: http://prediction-and-sink.kserve-test.svc.cluster.local/v1/models/mnist:predict
EOF
4. Create InferenceService
kubectl apply -f - <<EOF
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: prediction-and-sink
  namespace: kserve-test
spec:
  predictor:
    model:
      modelFormat:
        name: pytorch
      storageUri: gs://kfserving-examples/models/torchserve/image_classifier/v1
  transformer:
    containers:
      - image: docker-registry.lab.zverse.space/data-and-computing/ay-dev/msg-transformer:dev9
        name: kserve-container
        env:
          - name: KAFKA_BOOTSTRAP_SERVERS
            value: kafka.database.svc.cluster.local
          - name: KAFKA_TOPIC
            value: test-topic # result will be saved in this topic
          - name: REQUEST_TRACE_KEY
            value: test-trace-id # using this key to retrieve prediction result
        command:
          - "python"
          - "-m"
          - "model"
        args:
          - --model_name
          - mnist
EOF
root@ay-k3s01:~# kubectl -n kserve-test get pod
NAME READY STATUS RESTARTS AGE
prediction-and-sink-predictor-00001-deployment-f64bb76f-jqv4m 2/2 Running 0 3m46s
prediction-and-sink-transformer-00001-deployment-76cccd867lksg9 2/2 Running 0 4m3s
The source code of docker-registry.lab.zverse.space/data-and-computing/ay-dev/msg-transformer:dev9 can be found 🔗here
{"test-trace-id":"16ec3446-48d6-422e-9926-8224853e84a7","instances":[{"data":"iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAAAw0lEQVR4nGNgGFggVVj4/y8Q2GOR83n+58/fP0DwcSqmpNN7oOTJw6f+/H2pjUU2JCSEk0EWqN0cl828e/FIxvz9/9cCh1zS5z9/G9mwyzl/+PNnKQ45nyNAr9ThMHQ/UG4tDofuB4bQIhz6fIBenMWJQ+7Vn7+zeLCbKXv6z59NOPQVgsIcW4QA9YFi6wNQLrKwsBebW/68DJ388Nun5XFocrqvIFH59+XhBAxThTfeB0r+vP/QHbuDCgr2JmOXoSsAAKK7bU3vISS4AAAAAElFTkSuQmCC"}]}{"predictions":[2]// result will be saved in this topic as well
}
1. Kubernetes has been installed; if not, check 🔗link
2. ArgoCD has been installed; if not, check 🔗link
3. Helm binary has been installed; if not, check 🔗link
4. Ingress has been installed on argoCD; if not, check 🔗link
5. Minio has been installed; if not, check 🔗link

1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link
3. Cert-manager has been installed and the clusterissuer has a service named self-signed-ca-issuer; if not, check 🔗link

1. Kubernetes has been installed; if not, check 🔗link
2. ArgoCD has been installed; if not, check 🔗link
3. Cert-manager has been installed on argocd and the clusterissuer has a service named self-signed-ca-issuer; if not, check 🔗link
4. Ingress has been installed on argoCD; if not, check 🔗link
export DESTINATION_GATEKEEPER_IMAGE=<add registry like "myregistry.docker.io/gatekeeper">
make docker-buildx REPOSITORY=$DESTINATION_GATEKEEPER_IMAGE OUTPUT_TYPE=type=registry
And then deploy:
make deploy REPOSITORY=$DESTINATION_GATEKEEPER_IMAGE
1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link

Preliminary

1. Kubernetes has been installed; if not, check 🔗link
2. argoCD has been installed; if not, check 🔗link
3. cert-manager has been installed on argocd and the clusterissuer has a service named `self-signed-ca-issuer`; if not, check 🔗link

1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link

Preliminary

1. Kubernetes has been installed; if not, check 🔗link
2. argoCD has been installed; if not, check 🔗link
3. cert-manager has been installed on argocd and the clusterissuer has a service named `self-signed-ca-issuer`; if not, check 🔗link

1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link
3. ArgoCD has been installed; if not, check 🔗link
4. Argo Workflow has been installed; if not, check 🔗link

1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link
3. ArgoCD has been installed; if not, check 🔗link
4. Argo Workflow has been installed; if not, check 🔗link
5. Minio artifact repository has been configured; if not, check 🔗link
   - endpoint: minio.storage:9000
podman run --rm \
-it docker.io/library/redis:7.2.4-alpine \
redis-cli \
-h host.containers.internal \
set mykey somevalue
Preliminary
1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link
3. ArgoCD has been installed; if not, check 🔗link
4. Argo Workflow has been installed; if not, check 🔗link
5. Minio artifact repository has been configured; if not, check 🔗link
   - endpoint: minio.storage:9000

1. Kubernetes has been installed; if not, check 🔗link
2. Helm has been installed; if not, check 🔗link
3. ArgoCD has been installed; if not, check 🔗link
4. Argo Workflow has been installed; if not, check 🔗link
This guide gets you started with gRPC in C++ with a simple working example.
In the C++ world, there’s no universally accepted standard for managing project dependencies. You need to build and install gRPC before building and running this quick start’s Hello World example.
Build and locally install gRPC and Protocol Buffers.
The steps in this section explain how to build and locally install gRPC and Protocol Buffers using cmake. If you’d rather use bazel, see Building from source.
1. Setup
Choose a directory to hold locally installed packages. This page assumes that the environment variable MY_INSTALL_DIR holds this directory path. For example:
export MY_INSTALL_DIR=$HOME/.local
Ensure that the directory exists:
mkdir -p $MY_INSTALL_DIR
Add the local bin folder to your path variable, for example:
export PATH="$MY_INSTALL_DIR/bin:$PATH"
Important
We strongly encourage you to install gRPC locally — using an appropriately set CMAKE_INSTALL_PREFIX
— because there is no easy way to uninstall gRPC after you’ve installed it globally.
2. Install Essentials
2.1 Install Cmake
You need version 3.13 or later of cmake. Install it by following these instructions:
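As a sketch for Debian/Ubuntu systems; the packaged cmake may be older than 3.13 on old releases, in which case install a newer build from cmake.org instead:

```bash
sudo apt install -y cmake
cmake --version   # confirm it reports 3.13 or later
```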
While not mandatory, gRPC applications usually leverage Protocol Buffers for service definitions and data serialization, and the example code uses proto3.
The following commands build and locally install gRPC and Protocol Buffers:
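Before running them, the gRPC sources need to be cloned with submodules; the release tag below is only an example, pick a current one:

```bash
git clone --recurse-submodules -b v1.66.0 --depth 1 --shallow-submodules https://github.com/grpc/grpc
```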
cd grpc
mkdir -p cmake/build
pushd cmake/build
cmake -DgRPC_INSTALL=ON \
-DgRPC_BUILD_TESTS=OFF \
-DCMAKE_INSTALL_PREFIX=$MY_INSTALL_DIR \
../..
make -j 4
make install
popd
3. Run the example
The example code is part of the grpc repo source, which you cloned as part of the steps of the previous section.
3.1 Change to the example’s directory:
cd examples/cpp/helloworld
3.2 Build the example project using cmake
Make sure you can still echo $MY_INSTALL_DIR and get a valid result.
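A build sketch following the same cmake layout used above, assuming you are still inside examples/cpp/helloworld:

```bash
mkdir -p cmake/build
pushd cmake/build
cmake -DCMAKE_PREFIX_PATH=$MY_INSTALL_DIR ../..
make -j 4
popd
```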
# K8S_MASTER_IP could be your master ip or loadbalancer external ip
K8S_MASTER_IP=172.27.253.27
MINIO_ACCESS_SECRET=$(kubectl -n storage get secret minio-secret -o jsonpath='{.data.rootPassword}' | base64 -d)
podman run --rm \
--entrypoint bash \
--add-host=minio-api.dev.geekcity.tech:${K8S_MASTER_IP}\
-it docker.io/minio/mc:latest \
-c "mc alias set minio http://minio-api.dev.geekcity.tech admin ${MINIO_ACCESS_SECRET} \
&& mc ls minio \
&& mc mb --ignore-existing minio/argo-workflows-artifacts"
1. Kubernetes has been installed; if not, check 🔗link
2. Helm binary has been installed; if not, check 🔗link
Preliminary
1. Kubernetes has been installed; if not, check 🔗link
2. ArgoCD has been installed; if not, check 🔗link
3. Ingress has been installed on argoCD; if not, check 🔗link
4. Cert-manager has been installed on argocd and the clusterissuer has a service named `self-signed-ca-issuer`; if not, check 🔗link
podman run --rm \
--entrypoint bash \
-it docker.io/minio/mc:latest \
-c "mc alias set minio http://host.docker.internal:9000 minioadmin minioadmin \
&& mc ls minio \
&& mc mb --ignore-existing minio/test \
&& mc cp /etc/hosts minio/test/etc/hosts \
&& mc ls --recursive minio"
Use `systemctl status xxxx` to check whether the `xxxx` service is running
```text
# vim /usr/lib/systemd/system/slurmdbd.service
[Unit]
Description=Slurm DBD accounting daemon
After=network-online.target remote-fs.target munge.service mysql.service mysqld.service mariadb.service sssd.service
Wants=network-online.target
ConditionPathExists=/etc/slurm/slurmdbd.conf
[Service]
Type=simple
EnvironmentFile=-/etc/sysconfig/slurmdbd
EnvironmentFile=-/etc/default/slurmdbd
User=slurm
Group=slurm
RuntimeDirectory=slurmdbd
RuntimeDirectoryMode=0755
ExecStart=/usr/local/sbin/slurmdbd -D -s $SLURMDBD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536
# Uncomment the following lines to disable logging through journald.
# NOTE: It may be preferable to set these through an override file instead.
#StandardOutput=null
#StandardError=null
[Install]
WantedBy=multi-user.target
```
```text
# vim /usr/lib/systemd/system/slurmctld.service
[Unit]
Description=Slurm controller daemon
After=network-online.target remote-fs.target munge.service sssd.service
Wants=network-online.target
ConditionPathExists=/etc/slurm/slurm.conf
[Service]
Type=notify
EnvironmentFile=-/etc/sysconfig/slurmctld
EnvironmentFile=-/etc/default/slurmctld
User=slurm
Group=slurm
RuntimeDirectory=slurmctld
RuntimeDirectoryMode=0755
ExecStart=/usr/local/sbin/slurmctld --systemd $SLURMCTLD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536
# Uncomment the following lines to disable logging through journald.
# NOTE: It may be preferable to set these through an override file instead.
#StandardOutput=null
#StandardError=null
[Install]
WantedBy=multi-user.target
```
```text
# vim /usr/lib/systemd/system/slurmd.service
[Unit]
Description=Slurm node daemon
After=munge.service network-online.target remote-fs.target sssd.service
Wants=network-online.target
#ConditionPathExists=/etc/slurm/slurm.conf
[Service]
Type=notify
EnvironmentFile=-/etc/sysconfig/slurmd
EnvironmentFile=-/etc/default/slurmd
RuntimeDirectory=slurm
RuntimeDirectoryMode=0755
ExecStart=/usr/local/sbin/slurmd --systemd $SLURMD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
LimitNOFILE=131072
LimitMEMLOCK=infinity
LimitSTACK=infinity
Delegate=yes
# Uncomment the following lines to disable logging through journald.
# NOTE: It may be preferable to set these through an override file instead.
#StandardOutput=null
#StandardError=null
[Install]
WantedBy=multi-user.target
```
systemctl start slurmd
systemctl enable slurmd
Use `systemctl status slurmd` to check whether the `slurmd` service is running
systemctl start slurmd
systemctl enable slurmd
Use `systemctl status slurmd` to check whether the `slurmd` service is running
systemctl start slurmd
systemctl enable slurmd
Use `systemctl status slurmd` to check whether the `slurmd` service is running
test slurm
check cluster configuration
scontrol show config
check cluster status
sinfo
scontrol show partition
scontrol show node
submit job
srun -N2 hostname
scontrol show jobs
check job status
squeue -a
Install From Binary
(All) means all type nodes should install this component.
(Mgr) means only the manager node should install this component.
(Auth) means only the Auth node should install this component.
(Cmp) means only the Compute node should install this component.
Typically, three nodes are required to run Slurm: 1 Manager (Mgr), 1 Auth, and N Compute (Cmp), but you can choose to install all services on a single node. check
Disable firewall, selinux, dnsmasq, and swap (All). More detail here
NFS Server (Mgr). NFS is used as the default file system for the Slurm accounting database.
[NFS Client] (All). All nodes should mount the NFS share:
mount <$nfs_server>:/data /data -o proto=tcp -o nolock
Munge(All). The auth/munge plugin will be built if the MUNGE authentication development library is installed. MUNGE is used as the default authentication mechanism.
Database (Mgr). MySQL support for accounting will be built if the MySQL or MariaDB development library is present. A currently supported version of MySQL or MariaDB should be used.
install mariadb
yum -y install mariadb-server
systemctl start mariadb && systemctl enable mariadb
ROOT_PASS=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 16)
mysql -e "CREATE USER root IDENTIFIED BY '${ROOT_PASS}'"
Contains the definition (list) of the nodes that are assigned to the job.
| Variable | Description |
| --- | --- |
| $SLURM_NODELIST | Deprecated. Same as SLURM_JOB_NODELIST. |
| $SLURM_CPUS_PER_TASK | Number of CPUs per task. |
| $SLURM_CPUS_ON_NODE | Number of CPUs on the allocated node. |
| $SLURM_JOB_CPUS_PER_NODE | Count of processors available to the job on this node. |
| $SLURM_CPUS_PER_GPU | Number of CPUs requested per allocated GPU. |
| $SLURM_MEM_PER_CPU | Memory per CPU. Same as --mem-per-cpu. |
| $SLURM_MEM_PER_GPU | Memory per GPU. |
| $SLURM_MEM_PER_NODE | Memory per node. Same as --mem. |
| $SLURM_GPUS | Number of GPUs requested. |
| $SLURM_NTASKS | Same as -n, --ntasks. The number of tasks. |
| $SLURM_NTASKS_PER_NODE | Number of tasks requested per node. |
| $SLURM_NTASKS_PER_SOCKET | Number of tasks requested per socket. |
| $SLURM_NTASKS_PER_CORE | Number of tasks requested per core. |
| $SLURM_NTASKS_PER_GPU | Number of tasks requested per GPU. |
| $SLURM_NPROCS | Same as -n, --ntasks. See $SLURM_NTASKS. |
| $SLURM_TASKS_PER_NODE | Number of tasks to be initiated on each node. |
| $SLURM_ARRAY_JOB_ID | Job array’s master job ID number. |
| $SLURM_ARRAY_TASK_ID | Job array ID (index) number. |
| $SLURM_ARRAY_TASK_COUNT | Total number of tasks in a job array. |
| $SLURM_ARRAY_TASK_MAX | Job array’s maximum ID (index) number. |
| $SLURM_ARRAY_TASK_MIN | Job array’s minimum ID (index) number. |
A full list of environment variables for SLURM can be found by visiting the SLURM page on environment variables.
File Operations
File Distribution
sbcast
is used to transfer a file from local disk to local disk on the nodes allocated to a job. This can be used to effectively use diskless compute nodes or provide improved performance relative to a shared file system.
Distribute files: quickly copy files to all compute nodes assigned to the job, avoiding the hassle of manually distributing them. It is faster than traditional scp or rsync, especially when distributing to multiple nodes.
Simplify scripts: a single command distributes files to all nodes assigned to the job.
Improve performance: parallelized transfers improve file distribution speed, especially for large or multiple files.
#!/bin/bash
#SBATCH --job-name=example_job
#SBATCH --output=example_job.out
#SBATCH --error=example_job.err
#SBATCH --partition=compute
#SBATCH --nodes=4

# Use sbcast to distribute the file to the /tmp directory of each node
sbcast data.txt /tmp/data.txt

# Run your program using the distributed files
srun my_program /tmp/data.txt
File Collection
File Redirection
When submitting a job, you can use the #SBATCH --output and #SBATCH --error directives to redirect standard output and standard error to specified files.
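For example (the file names are illustrative; %j expands to the job ID):

```bash
#SBATCH --output=result_%j.out   # standard output
#SBATCH --error=result_%j.err    # standard error
```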
Send the destination address manually
Using scp or rsync in the job to copy the files from the compute nodes to the submit node
Using NFS
If a shared file system (such as NFS, Lustre, or GPFS) is configured in the computing cluster, the result files can be written directly to the shared directory. In this way, the result files generated by all nodes are automatically stored in the same location.
Using sbcast
Submit Jobs
3 Types of Jobs
srun
is used to submit a job for execution or initiate job steps in real time.
salloc
is used to allocate resources for a job in real time. Typically this is used to allocate resources and spawn a shell. The shell is then used to execute srun commands to launch parallel tasks.
Allocate resources (more like creating a virtual machine):
salloc -N2 bash
This command will create a job which allocates 2 nodes and spawns a bash shell; you can then execute srun commands in that environment. After your computing task finishes, remember to shut down your job.
scancel <$job_id>
when you exit the job, the resources will be released.
#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
    int rank, size;

    // Initialize the MPI environment
    MPI_Init(&argc, &argv);

    // Get the rank of the current process and the total number of processes
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Print the process information
    printf("Hello, World! I am process %d out of %d processes.\n", rank, size);

    // Finalize the MPI environment
    MPI_Finalize();
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define N 8 // vector size

// Compute the local dot product of a vector slice
double compute_local_dot_product(double *A, double *B, int start, int end) {
    double local_dot = 0.0;
    for (int i = start; i < end; i++) {
        local_dot += A[i] * B[i];
    }
    return local_dot;
}

void print_vector(double *Vector) {
    for (int i = 0; i < N; i++) {
        printf("%f ", Vector[i]);
    }
    printf("\n");
}

int main(int argc, char *argv[]) {
    int rank, size;

    // Initialize the MPI environment
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Vectors A and B
    double A[N], B[N];

    // Process 0 initializes vectors A and B
    if (rank == 0) {
        for (int i = 0; i < N; i++) {
            A[i] = i + 1;       // sample data
            B[i] = (i + 1) * 2; // sample data
        }
    }

    // Broadcast vectors A and B to all processes
    MPI_Bcast(A, N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
    MPI_Bcast(B, N, MPI_DOUBLE, 0, MPI_COMM_WORLD);

    // Each process computes its own slice
    int local_n = N / size;   // number of elements handled by each process
    int start = rank * local_n;
    int end = (rank + 1) * local_n;

    // The last process handles any remaining elements (N % size)
    if (rank == size - 1) {
        end = N;
    }

    double local_dot_product = compute_local_dot_product(A, B, start, end);

    // Use MPI_Reduce to sum the local dot products into process 0
    double global_dot_product = 0.0;
    MPI_Reduce(&local_dot_product, &global_dot_product, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    // Process 0 prints the final result
    if (rank == 0) {
        printf("Vector A is\n");
        print_vector(A);
        printf("Vector B is\n");
        print_vector(B);
        printf("Dot Product of A and B: %f\n", global_dot_product);
    }

    // Finalize the MPI environment
    MPI_Finalize();
    return 0;
}
#!/bin/bash
#SBATCH --job-name=mpi_job              # Job name
#SBATCH --nodes=2                       # Number of nodes to use
#SBATCH --ntasks-per-node=1             # Number of tasks per node
#SBATCH --time=00:10:00                 # Time limit
#SBATCH --output=mpi_test_output_%j.log # Standard output file
#SBATCH --error=mpi_test_output_%j.err  # Standard error file

# Manually set Intel OneAPI MPI and Compiler environment
export I_MPI_PMI=pmi2
export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/slurm/mpi_pmi2.so
export I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.14
export INTEL_COMPILER_ROOT=/opt/intel/oneapi/compiler/2025.0
export PATH=$I_MPI_ROOT/bin:$INTEL_COMPILER_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$I_MPI_ROOT/lib:$INTEL_COMPILER_ROOT/lib:$LD_LIBRARY_PATH
export MANPATH=$I_MPI_ROOT/man:$INTEL_COMPILER_ROOT/man:$MANPATH

# Compile the MPI program
icx-cc -I$I_MPI_ROOT/include hello_mpi.c -o hello_mpi -L$I_MPI_ROOT/lib -lmpi

# Run the MPI job
mpirun -np 2 ./hello_mpi
#!/bin/bash
#SBATCH --job-name=mpi_job              # Job name
#SBATCH --nodes=2                       # Number of nodes to use
#SBATCH --ntasks-per-node=1             # Number of tasks per node
#SBATCH --time=00:10:00                 # Time limit
#SBATCH --output=mpi_test_output_%j.log # Standard output file
#SBATCH --error=mpi_test_output_%j.err  # Standard error file

# Manually set Intel OneAPI MPI and Compiler environment
export I_MPI_PMI=pmi2
export I_MPI_PMI_LIBRARY=/usr/lib/x86_64-linux-gnu/slurm/mpi_pmi2.so
export I_MPI_ROOT=/opt/intel/oneapi/mpi/2021.14
export INTEL_COMPILER_ROOT=/opt/intel/oneapi/compiler/2025.0
export PATH=$I_MPI_ROOT/bin:$INTEL_COMPILER_ROOT/bin:$PATH
export LD_LIBRARY_PATH=$I_MPI_ROOT/lib:$INTEL_COMPILER_ROOT/lib:$LD_LIBRARY_PATH
export MANPATH=$I_MPI_ROOT/man:$INTEL_COMPILER_ROOT/man:$MANPATH

# Compile the MPI program
icx-cc -I$I_MPI_ROOT/include dot_product.c -o dot_product -L$I_MPI_ROOT/lib -lmpi

# Run the MPI job
mpirun -np 2 ./dot_product