#!/bin/bash
# SPDX-License-Identifier: GPL-2.0-only
#
# vm-run: Run command under QEMU emulated root
#
# Copyright (C) 2019 Vitaly Chikunov <vt@altlinux.org>
#

set -efu

# Snapshot the caller's environment before this script defines any variables
# of its own. The subshell sets LOGNAME and USER to root, then dumps the
# exported functions (`declare -fx') and exported variables (`export') as
# shell code into a fresh temporary file. More lines are appended to the
# same file further down.
SCRIPT=$(mktemp)
(
	LOGNAME=root
	USER=root
	declare -fx
	export
) > "$SCRIPT"

# Additional options to qemu run
OPTS=
APPEND=
NOCMD=
# `time' is also a bash reserved word, so a plain `type time' succeeds even
# when no time executable is installed. $TIME is later expanded as an
# ordinary command word (where the reserved word is not recognised), so a
# real executable is required: `type -P' forces a PATH search.
if type -P time >/dev/null 2>&1; then
	TIME=time
else
	TIME=
fi
SBIN=
VERBOSE=
QUIET=quiet	# no boot messages
DEF=
MAXMEM=
MAXCPU=
DEPMOD=
BIOS=
KVM=try		# try to use kvm by default
VIRTIOBUS=pci	# virtio bus to use

usage() {
	# Print the option summary on stdout, one line per argument, and
	# terminate the script with status 1 (this is also what -h|--help gets).
	printf '%s\n' \
		"Usage: $0 [options] command..." \
		"Options:" \
		"    -h|--help        this help" \
		"    -s|--silent      do not show qemu command" \
		"    --verbose        show vm actions verbosely" \
		"    --no-quiet       do not boot kernel too quietly" \
		"    --sbin           append sbin dirs to the PATH" \
		"    --udevd          start udevd" \
		"    --qemu='...'     pass additional options to qemu" \
		"    --append='...'   append to kernel cmdline" \
		"    --drive=         shortcut to pass '-drive file=' to qemu" \
		"    --fat=dir        shortcut to pass dir as rw FAT partition" \
		"    --overlay=fs[,size=][:path]" \
		"                     auto create overlay filesystem over path" \
		"    --bios=          Use particular firmware (ex. uefi)" \
		"    --uefi           Use OVMF UEFI firmware (alias --bios=uefi)" \
		"    --secureboot     Use OVMF SecureBoot fw (alias --bios=secureboot)" \
		"    --microvm        Use microvm machine" \
		"    --tcg            Do not try to enable KVM" \
		"    --kvm=cond       Only run if KVM is present otherwise exit 0" \
		"    --mem=           pass '-m ...' to qemu (memory size)" \
		"    --cpu=           pass '-smp ...' to qemu (cpus)" \
		"    --kernel=        kernel version to use (--kernels to list)" \
		"    --depmod         auto-depmod %buildroot installed kernel" \
		"    --               no options after this marker" \
		"    command...       will be eval'ed on the target"
	exit 1
}

# Parse leading options. Every iteration shifts the option just read off the
# positional parameters, so "$@" always holds what is still unprocessed:
#  - `--' stops parsing and leaves the rest as the command;
#  - the first word that is not an option is pushed back and ends parsing;
#  - an unknown option prints a diagnostic and the usage text.
# The first matching pattern wins, so specific patterns must precede
# broader ones (e.g. `--kernel=*' before `--kernel*').
for opt do
	shift
	case "$opt" in
		-h|--help)   usage ;;
		-s|--silent) NOCMD=y TIME= ;;
		--verbose)   VERBOSE=1 APPEND+=" VERBOSE" ;;
		--no-quiet)  QUIET= ;;
		--sbin)      SBIN=y ;;
		--udevd)     APPEND+=" UDEVD=y" ;;
		--qemu=*)    OPTS+=" ${opt#*=}" ;;
		--append=*)  APPEND+=" ${opt#*=}" ;;
		# Documented shortcut: the value is used as `file=' of a -drive.
		# (A second, unreachable `--drive=*' arm used to follow further
		# down; it could never match and has been dropped.)
		--drive=*)   OPTS+=" -drive file=${opt#*=}" ;;
		--fat=*)     OPTS+=" -drive format=raw,file=fat:rw:${opt#*=}" ;;
		--overlay=*) APPEND+=" OVERLAY=${opt#*=}" ;;
		--uefi)	     BIOS=uefi ;;
		--securebo*) BIOS=secureboot ;;
		--bios=*)    BIOS="${opt#*=}" ;;
		--tcg)	     KVM= ;;
		--kvm=*)     KVM="${opt#*=}" ;;
		--microvm)   VIRTIOBUS=device OPTS+=" -M microvm" BIOS=microvm ;;
		# Straight pass-through of the equally named qemu options.
		--machine=*) OPTS+=" -machine ${opt#*=}" ;;
		--global=*)  OPTS+=" -global ${opt#*=}" ;;
		--object=*)  OPTS+=" -object ${opt#*=}" ;;
		--device=*)  OPTS+=" -device ${opt#*=}" ;;
		--blockdev=*)OPTS+=" -blockdev ${opt#*=}" ;;
		--netdev=*)  OPTS+=" -netdev ${opt#*=}" ;;
		--chardev=*) OPTS+=" -chardev ${opt#*=}" ;;
		--mem=*)     OPTS+=" -m ${opt#*=}" ;;
		--cpu=*)     NPROC="${opt#*=}" ;;
		--kernel=*)  KERNEL="${opt#*=}" ;;
		# Anything else starting with --kernel (e.g. --kernels) sets an
		# empty KERNEL, which later triggers the kernel listing.
		--kernel*)   KERNEL= ;;
		--depmod)    DEPMOD=y ;;
		--) break ;;
		-*) echo "Unknown option $opt $*" >&2; usage ;;
		*)  set -- "$opt" "$@"; break ;;
	esac
done

# If no command run shell
if [ -z "$*" ]; then
	set -- bash
	SBIN=y
fi

# If we already have root just run the command
# The substitution is quoted: when whoami prints nothing (it failed, e.g.
# no passwd entry for the uid) an unquoted expansion would collapse into
# the malformed test `[ = root ]' and emit a spurious error.
if [ "$(env - whoami)" = root ]; then
	exec "$@"
	exit
fi

# Signal to vm-init that we don't have tty on stdin,
# so it will not try to determine terminal size
if [ ! -t 0 ]; then
	APPEND+=" NOTTY"
fi

# Prepare env
# Append the part of the guest script that runs the command: an optional
# PATH extension, a cd into the current directory, the command itself
# (each word %q-escaped and handed to eval), and a trailer that keeps the
# command's status across `stty sane'.
# One quoted redirection for the whole group, so a temporary path with
# whitespace cannot turn into an ambiguous redirect.
{
	if [ -n "$SBIN" ]; then
		echo "PATH=/sbin:/usr/sbin:\$PATH"
	fi
	printf 'cd %q\n' "$PWD"
	printf 'eval'
	printf ' %q' "$@"
	printf '\n'
	# Flush console.
	printf 'RET=$?\nstty sane\nexit $RET\n'
} >> "$SCRIPT"
chmod a+rx "$SCRIPT"

kvm_ok() {
	# Succeed only if /dev/kvm is a writable character device that can
	# really be opened for reading; all diagnostics are discarded.
	{
		[ -c /dev/kvm ] && [ -w /dev/kvm ] && true < /dev/kvm
	} 2>/dev/null
}

kvm_need() {
	# Fail when $KVM holds one of the "KVM disabled" spellings (or is
	# empty); succeed for every other value.
	case "$KVM" in
		no | off | false | tcg | "") return 1 ;;
	esac
	return 0
}

ppc_opts() {
	# Print the qemu options for powerpc64le on stdout.
	#
	# The CPU is `power8' rather than `host' because the run may
	# transparently fall back from kvm to tcg; none of the options
	# printed here break that fallback.
	printf '%s' " -M cap-ibs=broken,cap-cfpc=broken,cap-sbbc=broken" \
		" -cpu power8"

	if ! { kvm_need && kvm_ok; }; then
		# No KVM: glue a compat suffix onto the -cpu value printed above
		# (KVM doesn't support this).
		printf '%s' ",compat=power7"
		return
	fi

	# Without `/usr/sbin/ppc64_cpu', SMT is detected by the presence of
	# online processors 1-7 (processors 0,8,.. are not SMT ones).
	if grep -q -P '^processor\s+:\s[1234567]$' /proc/cpuinfo; then
		# SMT enabled: use the slower PR (problem state) KVM.
		printf '%s\n' " -M kvm-type=PR"
	else
		# KVM HV is faster, but incompatible with SMT.
		printf '%s\n' " -M kvm-type=HV,cap-ccf-assist=off"
	fi
}

# Per-architecture settings, selected by bash's $HOSTTYPE: the qemu binary
# (QEMU), the guest console device (CONSOLE), extra machine/cpu options,
# optional memory/cpu caps (MAXMEM, MAXCPU) and, where available, firmware
# locations (EDK2, EFI_CODE, SB_PFLASH). PACKAGE is only recorded here
# (presumably the package that ships the qemu binary).
case "$HOSTTYPE" in
	powerpc64le)
		PACKAGE=qemu-system-ppc-core QEMU=qemu-system-ppc64 CONSOLE=hvc0
		OPTS+=$(ppc_opts)
		# Avoid `CPU time limit exceeded'
		MAXMEM=32768 MAXCPU=8
		;;
	aarch64)
		PACKAGE=qemu-system-aarch64-core QEMU=qemu-system-aarch64 CONSOLE=ttyAMA0
		# Values that can work well both for kvm and tcg
		OPTS+=" -M virt,gic-version=3 -cpu max"
		# More cpu and memory is slower launch, so limit them
		# to sane big values
		MAXMEM=4096 MAXCPU=8
		EDK2=/usr/share/AAVMF EFI_CODE=QEMU_EFI.fd
		;;
	armh)
		PACKAGE=qemu-system-arm-core QEMU=qemu-system-arm CONSOLE=ttyAMA0
		OPTS+=" -M virt,highmem=off -cpu max"
		APPEND+=" watchdog_thresh=60"
		MAXMEM=2047 MAXCPU=4
		;;
	i586)
		PACKAGE=qemu-system-x86-core QEMU=qemu-system-i386 CONSOLE=ttyS0
		MAXMEM=1536
		# Both x86 flavours boot with no_timer_check.
		APPEND+=" no_timer_check"
		;;
	x86_64)
		PACKAGE=qemu-system-x86-core QEMU=qemu-system-x86_64 CONSOLE=ttyS0
		EDK2=/usr/share/OVMF EFI_CODE=OVMF_CODE.fd
		SB_PFLASH="OVMF_CODE.secboot.fd OVMF_VARS.secboot.fd"
		# Both x86 flavours boot with no_timer_check.
		APPEND+=" no_timer_check"
		;;
	*)
		echo "Error: architecture $HOSTTYPE is unknown." >&2
		exit 1
		;;
esac

# Set up BIOS.
# $BIOS selects the firmware:
#   uefi        -bios with the platform's EFI image, if one is defined;
#   secureboot  q35 with SMM plus the platform's secure boot pflash images;
#   microvm     the microvm BIOS image;
#   other value passed to -bios unchanged;
#   empty       bios.bin when /usr/share/qemu provides it, else qemu default.
if [ "$BIOS" = "uefi" ]; then
	if [ -n "${EFI_CODE-}" ]; then
		OPTS+=" -bios $EDK2/$EFI_CODE"
	else
		echo "Error: UEFI is not available for this platform." >&2
		exit 1
	fi
elif [ "$BIOS" = "secureboot" ]; then
	# No leading space inside the quotes: with one the string is never
	# empty and platforms without pflash images would not be rejected.
	if [ -n "${SB_PFLASH-}" ]; then
		OPTS+=" -global ICH9-LPC.disable_s3=1 -M q35,smm=on"
		OPTS+=" -global driver=cfi.pflash01,property=secure,value=on"
		# SB_PFLASH is a space separated list; splitting is intended.
		for f in ${SB_PFLASH-}; do
			OPTS+=" -drive file=$EDK2/$f,if=pflash,format=raw,readonly"
		done
	else
		echo "Error: Secure Boot is not available for this platform." >&2
		exit 1
	fi
elif [ "$BIOS" = "microvm" ]; then
	OPTS+=" -bios bios-microvm.bin"
elif [ -n "$BIOS" ]; then
	# Any other name is handed to qemu as is, whether or not a file of
	# that name exists under /usr/share/qemu.
	OPTS+=" -bios $BIOS"
elif [ -e /usr/share/qemu/bios.bin ]; then
	# x86 BIOS that doesn't clear screen unlike default bios-256k.bin
	OPTS+=" -bios bios.bin"
fi

# Translate the requested $KVM mode into accelerator options in DEF.
if kvm_need; then
	case "$KVM" in
		try|detect)
			# Avoid qemu warning:
			#   Could not access KVM kernel module: No such file or directory
			#   qemu-system-x86_64: failed to initialize kvm: No such file or directory
			#   qemu-system-x86_64: falling back to tcg
			if kvm_ok; then
				DEF+=" -M accel=kvm:tcg"
			else
				DEF+=" -M accel=tcg"
			fi
			;;
		cond|if)
			# Run only when KVM is usable; otherwise finish successfully.
			if ! kvm_ok; then
				echo "Warning: Not running due to no KVM support (exit 0)."
				exit 0
			fi
			DEF+=" -enable-kvm"
			;;
		only|force|enable)
			DEF+=" -enable-kvm"
			;;
		any|all)
			DEF+=" -M accel=kvm:tcg"
			;;
		default)
			# Add nothing and leave the choice to qemu.
			;;
		*)
			# Diagnostics go to stderr like every other error here.
			echo "Error: Unknown --kvm=$KVM option." >&2
			exit 1
			;;
	esac
else
	# Forcefully disable KVM.
	DEF+=" -accel tcg"
fi
unset KVM

# Complain on stderr (without aborting) when /proc is not a mount point.
mountpoint -q /proc || printf '%s\n' \
	"===========================================" \
	"Warning: /proc is not mounted              " \
	"try: hsh-shell --mountpoints=/proc,/dev/kvm" \
	"===========================================" >&2

# Size the guest memory: M is the host's MemAvailable converted from kB
# to MiB, limited by the per-architecture MAXMEM when that is set.
M=
if [ -r /proc/meminfo ]; then
	while read -r key val _; do
		if [ "$key" = "MemAvailable:" ]; then
			M=$((val / 1024))
			break
		fi
	done < /proc/meminfo
	unset key val
fi
if [ -z "$M" ]; then
	# No usable figure (file missing or unreadable, or it carries no
	# MemAvailable line): use the same default as for a missing file
	# instead of dying on a failed lookup.
	M=256
elif [ -n "$MAXMEM" ] && [ "$M" -gt "$MAXMEM" ]; then
	M=$MAXMEM
fi
# Round to 256M for ppc
if [ "$HOSTTYPE" = powerpc64le ]; then
	M=$((M / 256 * 256))
fi
# Below 256M no -m option is passed at all.
if [ "$M" -ge 256 ]; then
	DEF+=" -m ${M}M"
fi

# Number of guest CPUs: an already set NPROC (from --cpu=) wins, otherwise
# ask nproc. A single CPU needs no -smp option; a larger count is capped
# at MAXCPU when the architecture defines one.
if [ ! -v NPROC ]; then
	NPROC=$(nproc)
fi
if [ "$NPROC" -gt 1 ]; then
	if [ -n "$MAXCPU" ] && [ "$MAXCPU" -lt "$NPROC" ]; then
		NPROC=$MAXCPU
	fi
	DEF+=" -smp cores=$NPROC"
fi

# Give the guest a user-mode virtio NIC when /proc/net/dev has more than
# three lines (presumably: two header lines, loopback, and at least one
# more interface -- TODO confirm). If neither `ip' nor `ifconfig' can be
# found, hint at the packages that provide them.
if [ -e /proc/net/dev ] && [ $(wc -l < /proc/net/dev) -gt 3 ]; then
	DEF+=" -nic user,model=virtio-net-pci"
	if ! type ip >/dev/null 2>&1 && ! type ifconfig >/dev/null 2>&1; then
		echo "Warning: To use share_network inside vm-run you may" >&2
		echo "  need to hsh-install iproute2 or net-tools" >&2
	fi
fi

# Print kernel images, one per line: %buildroot kernels first, then /boot,
# each group newest first (ls -t).
# The body is a subshell, so enabling globbing, disabling errexit and
# silencing stderr all stay local to this function.
list_kernels() (
	set +ef
	exec 2>/dev/null
	case "${RPM_BUILD_ROOT-}" in
		"")
			# hsh-shell, try to guess %buildroot.
			ls -t /usr/src/tmp/*-buildroot/boot/vmlinu[xz]-*
			;;
		*)
			# Normal build, we know exact %buildroot location.
			ls -t "$RPM_BUILD_ROOT"/boot/vmlinu[xz]-*
			;;
	esac
	ls -t /boot/vmlinu[xz]-*
)

list_kernels_ui() {
	# Show on stderr either the tab-indented output of list_kernels
	# under a heading, or a hint that no kernel could be found.
	if ! list_kernels | grep -q .; then
		echo "  No kernels found, try to install a kernel first." >&2
		return
	fi
	echo "  List of available kernels:" >&2
	list_kernels | sed 's/^/\t/' >&2
}

guess_kernel() {
	# Print the first line of list_kernels that matches $1 (a grep
	# pattern; empty when omitted). Three attempts, better match first:
	# the whole line is ".*/PATTERN" (-x), then PATTERN as a word (-w),
	# then PATTERN anywhere. Prints nothing when all fail; the status is
	# always 0.
	local wanted=${1-} attempt

	# Each entry is "<extra grep flag>:<pattern>"; the flag part is left
	# unquoted on purpose so that an empty flag vanishes.
	for attempt in "-x:.*/$wanted" "-w:$wanted" ":$wanted"; do
		if list_kernels | grep -s -m1 ${attempt%%:*} -e "${attempt#*:}"; then
			return 0
		fi
	done
	return 0
}

# Resolve the kernel image into KERN, then publish it as KERNEL:
#  - KERNEL names an existing file: take it verbatim;
#  - KERNEL is unset (no --kernel= option): take the most preferred kernel;
#  - KERNEL is set but empty: just list the kernels and stop;
#  - anything else: use KERNEL as the pattern for guess_kernel.
if [ -f "${KERNEL-}" ]; then
	KERN=$KERNEL
elif [ -z "${KERNEL+set}" ]; then
	KERN=$(guess_kernel)
elif [ -z "$KERNEL" ]; then
	list_kernels_ui
	exit 1
else
	KERN=$(guess_kernel "$KERNEL")
fi

# Whatever was picked has to be an existing file.
if [ ! -f "$KERN" ]; then
	if [ -z "${KERNEL-}" ]; then
		echo "Error: Cannot find a kernel, try to use '--kernel=' option." >&2
	else
		echo "Error: Cannot find the kernel matching '$KERNEL'." >&2
	fi
	list_kernels_ui
	exit 1
fi
KERNEL=$KERN
unset KERN

# ROOT is whatever precedes the first "/boot/" in the kernel path; it is
# empty for a kernel under the real /boot.
ROOT=${KERNEL%%/boot/*}
# For a non-empty ROOT, leave a MODPROBE_OPTIONS line in the guest script
# as a comment. The two spaces after '#' defend against accidental
# injection: the command is '%q'-escaped and cannot contain consecutive
# spaces.
[ -z "$ROOT" ] || printf '#  MODPROBE_OPTIONS=%q\n' "--dirname=$ROOT" >> $SCRIPT

# Generate initramfs.
# Skipped when INITRD already names an existing file. Otherwise a small
# image is built from the modules modprobe reports for 9p over virtio and
# from the init program shipped in /usr/lib/vm-run.
if [ ! -f "${INITRD-}" ]; then
	# Kernel version.
	KVER=${KERNEL##*/vmlinu[xz]-}
	if [ ! -d "$ROOT/lib/modules/$KVER" ]; then
		echo "Error: Cannot find $ROOT/lib/modules/$KVER for your kernel." >&2
		echo "       Please, install the kernel or make modules_install!"  >&2
		exit 1
	fi
	# modprobe needs a non-empty modules.dep.bin. Create it when the
	# module directory is writable, but inside a buildroot only when
	# --depmod was given.
	if [ ! -s "$ROOT/lib/modules/$KVER/modules.dep.bin" ]; then
		if [ -w "$ROOT/lib/modules/$KVER" ]; then
			if [ -z "$DEPMOD" ] && [[ "$ROOT" =~ buildroot ]]; then
				echo "  Run 'depmod -b $ROOT $KVER'" >&2
				echo "  or start vm-run with '--depmod' option for automatic depmod." >&2
				exit 1
			else
				# Quoted so that a path with whitespace stays one word;
				# `-b ROOT' is only passed for a non-empty ROOT.
				( set -x; /sbin/depmod ${ROOT:+-b "$ROOT"} "$KVER" )
			fi
		else
			echo "Error: No valid 'modules.dep.bin' in $ROOT/lib/modules/$KVER"  >&2
			echo "       which is also not writable directory. As a result 'modprobe' will not work!"  >&2
			exit 1
		fi
	fi
	INITRD=${TMPDIR:-/tmp}/initramfs-$KVER.img
	# Staging directory for the archive contents.
	CPIO=$(mktemp -d)
	MODULES="9p 9pnet_virtio "
	[ "$VIRTIOBUS" = "pci" ] && MODULES+="virtio_pci" || MODULES+="virtio_mmio"
	# $MODULES is split into separate module names on purpose.
	if ! /sbin/modprobe -d "$ROOT" -S "$KVER" -D -a $MODULES > "$CPIO/modprobe.txt"; then
		echo "Error: modprobe failure." >&2
		if [ ! -s "$ROOT/lib/modules/$KVER/modules.dep.bin" ]; then
			[ -w "$ROOT/lib/modules/$KVER/" ] && HELP= \
				|| HELP=" under rooter"
			echo "  Run 'depmod${ROOT:+ -b $ROOT} $KVER'$HELP." >&2
		fi
		# Do not leave the staging directory behind on this exit path.
		rm -rf "$CPIO"
		exit 1
	fi
	# Each line of modprobe's output is read as "<word> <path>": the first
	# word is dropped and the file is copied into the staging directory.
	# .gz/.xz files are unpacked there. The resulting names are written,
	# in order, to modules.conf through fd 3. A module that is already
	# staged, or whose copy fails, is skipped.
	while read -r skip f; do
		bn=${f##*/}
		un=${bn%.?z}
		[ -e "$CPIO/$un" ] && continue
		cp -au "$f" "$CPIO/" 2>/dev/null || continue
		case "$bn" in
			*.gz) gzip -qd "$CPIO/$bn" ;;
			*.xz) xz   -qd "$CPIO/$bn" ;;
		esac
		printf '%s\n' "$un" >&3
	done < "$CPIO/modprobe.txt" \
		3> "$CPIO/modules.conf"
	cp -au /usr/lib/vm-run/initrd-init "$CPIO/init"
	rm "$CPIO/modprobe.txt"
	# Pack everything as a gzipped newc cpio archive owned by 0:0.
	(cd "$CPIO"; find . | cpio -o -H newc --quiet -R 0:0 | gzip) > "$INITRD"
	rm -rf "$CPIO"
	unset CPIO MODULES KVER skip f bn un
fi
unset ROOT

# Launch qemu in a subshell so that globbing, tracing and the preload
# environment below do not leak into the rest of the script.
# -serial mon:stdio  to make ^C not break qemu
# -device virtio-rng-pci  for virtio-rng
(
	set +f # Enable filename globbing
	if [ ! -t 0 ]; then
		# More debugging info on qemu crashes.
		VM_RUN_LIBSEGFAULT=$(echo /usr/lib*/libSegFault.so)
		if [ -f "$VM_RUN_LIBSEGFAULT" ]; then
			LD_PRELOAD=${LD_PRELOAD:+${LD_PRELOAD}:}$VM_RUN_LIBSEGFAULT
			SEGFAULT_USE_ALTSTACK=1
			export LD_PRELOAD SEGFAULT_USE_ALTSTACK
		fi
	fi
	# Show the qemu command unless --silent was given.
	test "$NOCMD" || set -x
	# $TIME, $DEF and $OPTS are option lists and are split on purpose;
	# the kernel and initrd paths are quoted so they stay single words.
	$TIME \
	$QEMU \
		$DEF \
		-nodefaults \
		-nographic \
		-no-reboot \
		-fsdev local,id=root,path=/,security_model=none,multidevs=remap \
		-device virtio-9p-$VIRTIOBUS,fsdev=root,mount_tag=/dev/root \
		-device virtio-rng-$VIRTIOBUS \
		-serial mon:stdio \
		-kernel "$KERNEL" \
		-initrd "$INITRD" \
		$OPTS \
		-append "console=$CONSOLE mitigations=off nokaslr $QUIET panic=-1 SCRIPT=$SCRIPT$APPEND"
)

# Finish with the status found in the non-empty file $SCRIPT.ret; without
# such a file the run counts as failed.
# Temporary script is not deleted intentionally.
if [ ! -s $SCRIPT.ret ]; then
	echo "Error: no exit code is found, failure." >&2
	exit 1
fi
read ret < $SCRIPT.ret
exit $ret
