#!/bin/bash
set -e
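# Bail out early when only-ui-bumps.sh reports (by exiting 0) that the change set contains only UI version bumps;
# the provisioning tests are skipped in that case.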
if ./scripts/only-ui-bumps.sh; then
    exit 0
fi
#########################################################################################################################################
# DISCLAIMER
# Copied from https://github.com/moby/moby/blob/ed89041433a031cafc0a0f19cfe573c31688d377/hack/dind#L28-L37
# Permission granted by Akihiro Suda <akihiro.suda.cz@hco.ntt.co.jp> (https://github.com/rancher/k3d/issues/493#issuecomment-827405962)
# Moby License Apache 2.0: https://github.com/moby/moby/blob/ed89041433a031cafc0a0f19cfe573c31688d377/LICENSE
#########################################################################################################################################
# Only run this if Rancher is not running in a Kubernetes cluster and the init cgroup does not already exist.
# We have to run this early in provisioning-tests; otherwise, moving the processes is likely to fail due to
# non-existent PIDs from the retry script used below to check Rancher's health.
if [ -f /sys/fs/cgroup/cgroup.controllers ] && [ ! -d /sys/fs/cgroup/init ]; then
    # move the processes from the root group to the /init group,
    # otherwise writing subtree_control fails with EBUSY.
    mkdir -p /sys/fs/cgroup/init
    xargs -rn1 < /sys/fs/cgroup/cgroup.procs > /sys/fs/cgroup/init/cgroup.procs || :
    # enable controllers
    sed -e 's/ / +/g' -e 's/^/+/' <"/sys/fs/cgroup/cgroup.controllers" >"/sys/fs/cgroup/cgroup.subtree_control"
fi
# Stop the Rancher wrapper loop and any running Rancher server process on exit,
# preserving the original exit code.
cleanup()
{
    EXIT=$?
    set +ex
    echo Stopping rancher server
    kill $RANCHER_RUN_PID
    wait $RANCHER_RUN_PID
    if [ "$PID" != "-1" ]; then
        kill $PID
        wait $PID
    fi
    return $EXIT
}
cd "$(dirname "$0")/.."
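# Select the distribution under test: defaults to k3s (tests pull it from the k3s-io org);
# any other V2PROV_TEST_DIST value keeps the rancher org.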
TB_ORG=rancher
if [ -z "${V2PROV_TEST_DIST}" ] || [ "${V2PROV_TEST_DIST}" = "k3s" ]; then
V2PROV_TEST_DIST=k3s
AIRGAP=-airgap
TB_ORG=k3s-io
else
LINUX=.linux
fi
if [ -z "${V2PROV_TEST_RUN_REGEX}" ]; then
    V2PROV_TEST_RUN_REGEX="^Test_(General|Provisioning|Fleet)_.*$"
fi
RUNARG="-run ${V2PROV_TEST_RUN_REGEX}"
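# Export the test configuration so it is visible to the Rancher server started below and to the provisioning tests.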
export DIST=${V2PROV_TEST_DIST}
export SOME_K8S_VERSION=${SOME_K8S_VERSION}
export TB_ORG=${TB_ORG}
export CATTLE_CHART_DEFAULT_URL=${CATTLE_CHART_DEFAULT_URL}
export CATTLE_FEATURES=${CATTLE_FEATURES}
# Tell Rancher to use the recently-built Rancher cluster agent image. This image is built as part of CI and will be
# copied to the in-cluster registry during test setup below.
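# scripts/version is expected to set TAG (and related build metadata) for the locally built images.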
source ./scripts/version
# Default to :head (matching the hardcoded value within Rancher) if CATTLE_AGENT_IMAGE or TAG is not specified
if [ "$TAG" = "dev" ]; then
    TAG="head"
fi
export CATTLE_AGENT_IMAGE=${CATTLE_AGENT_IMAGE:-rancher/rancher-agent:${TAG:-head}}
echo "Using Rancher agent image $CATTLE_AGENT_IMAGE"
eval "$(grep '^ENV CATTLE_SYSTEM_AGENT' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_WINS_AGENT' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_CSI_PROXY_AGENT' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_KDM_BRANCH' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_SYSTEM_UPGRADE_CONTROLLER_CHART_VERSION' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_SYSTEM_AGENT_DOWNLOAD_PREFIX' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_SYSTEM_AGENT_VERSION' package/Dockerfile | awk '{print "export " $2}')"
eval "$(grep '^ENV CATTLE_SYSTEM_AGENT_UPGRADE_IMAGE' package/Dockerfile | awk '{print "export " $2}')"
export CATTLE_RANCHER_WEBHOOK_VERSION=$(grep -m1 'webhookVersion' build.yaml | cut -d ' ' -f2)
export CATTLE_REMOTEDIALER_PROXY_VERSION=$(grep -m1 'remoteDialerProxyVersion' build.yaml | cut -d ' ' -f2)
export CATTLE_RANCHER_PROVISIONING_CAPI_VERSION=$(grep -m1 'provisioningCAPIVersion' build.yaml | cut -d ' ' -f2)
export CATTLE_CSP_ADAPTER_MIN_VERSION=$(grep -m1 'cspAdapterMinVersion' build.yaml | cut -d ' ' -f2)
export CATTLE_FLEET_VERSION=$(grep -m1 'fleetVersion' build.yaml | cut -d ' ' -f2)
if [ -z "${SOME_K8S_VERSION}" ]; then
    # Only set SOME_K8S_VERSION if it is empty -- for KDM provisioning tests, this value should already be populated.
    # Get the last release for $DIST, which is usually the latest version or an experimental version.
    # Previously this would use channels, but channels no longer reflect the latest version since
    # https://github.com/rancher/rancher/issues/36827 added appDefaults. We do not use appDefaults
    # here for simplicity's sake, as it requires semver parsing & matching. The last release should
    # be good enough for our needs.
    export SOME_K8S_VERSION=$(curl -sS https://raw.githubusercontent.com/rancher/kontainer-driver-metadata/dev-v2.12/data/data.json | jq -r ".$DIST.releases[-1].version")
fi
if [ -z "${CATTLE_CHART_DEFAULT_URL}" ]; then
    # If `CATTLE_CHART_DEFAULT_URL` is not set, use `https://github.com/rancher/charts` so that GitHub is used
    # instead of the default `https://git.rancher.io/charts`, reducing the reliance and load on our Git mirror.
    export CATTLE_CHART_DEFAULT_URL=https://github.com/rancher/charts
fi
echo "Starting rancher server for provisioning-tests using $SOME_K8S_VERSION"
touch /tmp/rancher.log
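# Write the images the provisioning tests need into the distribution's agent image pull list
# so the embedded k3s/rke2 can pre-pull them on startup.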
mkdir -p /var/lib/rancher/$DIST/agent/images
grep PodTestImage ./tests/v2prov/defaults/defaults.go | cut -f2 -d'"' > /var/lib/rancher/$DIST/agent/images/pull.txt
grep MachineProvisionImage ./pkg/settings/setting.go | cut -f4 -d'"' >> /var/lib/rancher/$DIST/agent/images/pull.txt
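# Place the rancher-system-agent binaries and uninstall script in Rancher's UI assets directory,
# from which Rancher serves them to provisioned nodes.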
mkdir -p /usr/share/rancher/ui/assets
curl -sLf ${CATTLE_SYSTEM_AGENT_DOWNLOAD_PREFIX}/${CATTLE_SYSTEM_AGENT_VERSION}/rancher-system-agent-amd64 -o /usr/share/rancher/ui/assets/rancher-system-agent-amd64
curl -sLf ${CATTLE_SYSTEM_AGENT_DOWNLOAD_PREFIX}/${CATTLE_SYSTEM_AGENT_VERSION}/rancher-system-agent-arm64 -o /usr/share/rancher/ui/assets/rancher-system-agent-arm64
curl -sLf ${CATTLE_SYSTEM_AGENT_DOWNLOAD_PREFIX}/${CATTLE_SYSTEM_AGENT_VERSION}/system-agent-uninstall.sh -o /usr/share/rancher/ui/assets/system-agent-uninstall.sh
# Watchdog loop: (re)start the Rancher server and, if it dies, dump its logs and the K3s logs,
# kill the embedded K3s, and start it again. After three deaths, signal the parent process to abort.
build_and_run_rancher()
{
    RESTART_COUNT=0
    while sleep 2; do
        if [ "$PID" != "-1" ] && [ ! -e /proc/$PID ]; then
            echo Rancher died
            dump_rancher_logs
            echo K3s logs were:
            echo -e "-----K3S-LOG-DUMP-START-----"
            gzip -c build/testdata/k3s.log | base64 -w 0
            echo -e "\n-----K3S-LOG-DUMP-END-----"
            set +e
            echo Attempting to kill K3s
            pkill -e k3s
            set -e
            PID=-1
            if [ "$RESTART_COUNT" = "2" ]; then
                echo Rancher died 3 times, aborting
                kill -42 $PWRAPPROC
            fi
            RESTART_COUNT=$((RESTART_COUNT + 1))
            sleep 5
        fi
        if [ "$PID" = "-1" ]; then
            echo Starting rancher server using run
            ./scripts/run >/tmp/rancher.log 2>&1 &
            PID=$!
        fi
        sleep 2
    done
}
# Dump /tmp/rancher.log as a gzipped, base64-encoded blob between markers.
dump_rancher_logs()
{
    echo Rancher logs were
    echo -e "-----RANCHER-LOG-DUMP-START-----"
    gzip -c /tmp/rancher.log | base64 -w 0
    echo -e "\n-----RANCHER-LOG-DUMP-END-----"
}
export -f dump_rancher_logs
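# Push the locally built cattle agent image into the in-cluster registry-cache so test clusters can pull it.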
push_cattle_agent_image()
{
    echo "pushing local cattle-agent image - waiting for registry-cache to be available"
    ./scripts/retry \
        --timeout 300 \
        --sleep 2 \
        --message "registry-cache was not available after {{elapsed}} seconds" \
        --message-interval 30 \
        --exit-command "echo registry-cache never came up. RIP" \
        "kubectl --kubeconfig /etc/rancher/k3s/k3s.yaml get pod registry-cache -n default &>/dev/null"
    echo "registry-cache pod found! waiting for pod to be running"
    kubectl --kubeconfig /etc/rancher/k3s/k3s.yaml wait --for=condition=ready pod/registry-cache -n default
    # We're referring to the Docker IP of the current container. Since the Docker socket comes from the host,
    # localhost won't work, so we address the registry by the container's internal Docker IP instead.
    echo "setting up credentials"
    docker login -u admin -p admin https://172.17.0.2:5000
    echo "pushing ${CATTLE_AGENT_IMAGE}"
    docker push 172.17.0.2:5000/$CATTLE_AGENT_IMAGE
    echo "successfully pushed $CATTLE_AGENT_IMAGE to registry-cache"
}
# Get the rancher binary ready
#
# This needs to happen before build_and_run_rancher is executed in the background.
# Otherwise, build_and_run_rancher would also compile Rancher, significantly lengthening the
# time before Rancher is ready and causing the health check below to flake.
if ! docker image inspect rancher/rancher:$TAG >/dev/null 2>&1 ; then
echo "building rancher from source - no preloaded container image available"
./scripts/build-server
else
# otherwise just copy it from the artifacts that are already there. neat!
echo "pulling bin/rancher from preloaded container image"
container_id=$(docker create rancher/rancher:$TAG)
docker cp $container_id:/usr/bin/rancher bin/rancher
docker rm $container_id
fi
# Uncomment to get startup logs. Don't leave them on, because it slows Drone down too much.
#tail -F /tmp/rancher.log &
#TPID=$!
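# build_and_run_rancher sends signal 42 to this process once Rancher has died too many times;
# treat that as a fatal error.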
trap "exit 1" 42
PWRAPPROC="$$"
PID=-1
build_and_run_rancher &
RANCHER_RUN_PID=$!
trap cleanup exit
echo "Waiting for Rancher to be healthy"
./scripts/retry --sleep 2 --timeout 300 "curl -sf -o /dev/null http://localhost:8080/ping"
# The remotedialer-proxy is pulled in by the api-extension deployment
echo "Waiting up to 5 minutes for the api-extension deployment"
./scripts/retry \
--timeout 300 `# Time out after 300 seconds (5 min)` \
--sleep 2 `# Sleep for 2 seconds in between attempts` \
--message-interval 30 `# Print the progress message below every 30 attempts (roughly every minute)` \
--message "api-extension was not available after {{elapsed}} seconds" `# Print this progress message` \
--exit-command "dump_rancher_logs" `# Dump logs to find out why api-extension did not start` \
"kubectl --kubeconfig /etc/rancher/k3s/k3s.yaml rollout status -n cattle-system deploy/api-extension &>/dev/null"
echo "Waiting up to 5 minutes for rancher-webhook deployment"
./scripts/retry \
--timeout 300 `# Time out after 300 seconds (5 min)` \
--sleep 2 `# Sleep for 2 seconds in between attempts` \
--message-interval 30 `# Print the progress message below every 30 attempts (roughly every minute)` \
--message "rancher-webhook was not available after {{elapsed}} seconds" `# Print this progress message` \
--exit-command "dump_rancher_logs" `# Dump logs to find out why webhook did not start` \
"kubectl --kubeconfig /etc/rancher/k3s/k3s.yaml rollout status -n cattle-system deploy/rancher-webhook &>/dev/null"
echo "Waiting up to 5 minutes for rancher-provisioning-capi deployment"
./scripts/retry \
--timeout 300 `# Time out after 300 seconds (5 min)` \
--sleep 2 `# Sleep for 2 seconds in between attempts` \
--message-interval 30 `# Print the progress message below every 30 attempts (roughly every minute)` \
--message "rancher-provisioning-capi was not available after {{elapsed}} seconds" `# Print this progress message` \
--exit-command "dump_rancher_logs" `# Dump logs to find out why capi-controller-manager did not start` \
"kubectl --kubeconfig /etc/rancher/k3s/k3s.yaml rollout status -n cattle-provisioning-capi-system deploy/capi-controller-manager &>/dev/null"
# only push the image if we're not using the default image:tag
if [ "${CATTLE_AGENT_IMAGE}" != "rancher/rancher-agent:head" ]; then
( push_cattle_agent_image & )
fi
#kill $TPID
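# Run the v2prov test suite against the local k3s cluster (kubeconfig of the k3s instance backing the local Rancher).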
echo Running provisioning-tests $RUNARG
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml
go test $RUNARG -v -failfast -timeout 60m ./tests/v2prov/tests/... || {
    dump_rancher_logs
    exit 1
}