#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

set -e

# Everything below mounts filesystems and writes to /proc/sys and cgroup
# control files, so bail out (as a skip, not a failure) unless we are root.
if [[ $(id -u) -ne 0 ]]; then
  echo "This test must be run as root. Skipping..."
  exit $ksft_skip
fi

# Remember the huge page pool size so it can be put back when the script
# ends. The restore is done from an EXIT trap so that it also happens on the
# failure paths, which exit right after cleanup() has set the pool to 0.
nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages)
trap 'echo "$nr_hugepgs" >/proc/sys/vm/nr_hugepages' EXIT

# Names of the per-cgroup hugetlb control files (cgroup v1 spelling). The
# full file name is hugetlb.<size>MB.<one of these>.
fault_limit_file=limit_in_bytes
reservation_limit_file=rsvd.limit_in_bytes
fault_usage_file=usage_in_bytes
reservation_usage_file=rsvd.usage_in_bytes

# "-cgroup-v2" as the first argument switches to the cgroup v2 file names.
if [[ "$1" == "-cgroup-v2" ]]; then
  cgroup2=1
  fault_limit_file=max
  reservation_limit_file=rsvd.max
  fault_usage_file=current
  reservation_usage_file=rsvd.current
fi

# Locate an existing cgroup mount to work in. If there is none, mount one on
# a freshly created temporary directory and remember (do_umount) that it has
# to be unmounted and removed at the end of the script.
if [[ $cgroup2 ]]; then
  cgroup_path=$(mount -t cgroup2 | head -1 | awk '{print $3}')
  if [[ -z "$cgroup_path" ]]; then
    cgroup_path=$(mktemp -d)
    mount -t cgroup2 none "$cgroup_path"
    do_umount=1
  fi
  echo "+hugetlb" >"$cgroup_path/cgroup.subtree_control"
else
  cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk '{print $3}')
  if [[ -z "$cgroup_path" ]]; then
    cgroup_path=$(mktemp -d)
    # NOTE(review): no -o option is passed, so "memory,hugetlb" is the
    # source/device argument of this mount rather than a controller list.
    # Confirm that is what is intended.
    mount -t cgroup memory,hugetlb "$cgroup_path"
    do_umount=1
  fi
fi
export cgroup_path

# Return the system to a known state between test cases.
# Globals:   cgroup2, cgroup_path (read)
# Outputs:   "error" if /mnt/huge could not be unmounted, then "CLEANUP DONE"
function cleanup() {
  local procs_file=tasks
  local test_cgroup

  # Writing our own PID to the procs file at the top of the mount moves this
  # shell there (presumably so the test cgroups below are empty and can be
  # removed). The file name differs between cgroup v1 and v2.
  if [[ $cgroup2 ]]; then
    procs_file=cgroup.procs
  fi
  echo $$ >$cgroup_path/$procs_file

  # Tear down the hugetlbfs mount left behind by a previous test, if any.
  # A failing umount is reported but tolerated.
  if [[ -e /mnt/huge ]]; then
    rm -rf /mnt/huge/*
    umount /mnt/huge || echo error
    rmdir /mnt/huge
  fi

  # Remove whichever of the test cgroups currently exist.
  for test_cgroup in hugetlb_cgroup_test hugetlb_cgroup_test1 \
    hugetlb_cgroup_test2; do
    if [[ -e $cgroup_path/$test_cgroup ]]; then
      rmdir $cgroup_path/$test_cgroup
    fi
  done

  # Empty the huge page pool.
  echo 0 >/proc/sys/vm/nr_hugepages
  echo CLEANUP DONE
}

# Assert that two strings are identical.
# Arguments: $1 - expected value
#            $2 - actual value
#            $3 - explanation appended to the failure message
# Outputs:   nothing on success; on mismatch a one-line diagnostic
# Returns:   0 when the values match; otherwise runs cleanup and exits the
#            script with status 1.
function expect_equal() {
  local want="$1"
  local got="$2"
  local reason="$3"

  if [[ "$want" == "$got" ]]; then
    return 0
  fi

  echo "expected ($want) != actual ($got): $reason"
  cleanup
  exit 1
}

# Print the default huge page size in MB.
# Arguments: $1 - (optional) meminfo-format file to read; defaults to
#                 /proc/meminfo
# Outputs:   the "Hugepagesize:" value, converted from kB to MB, on stdout
# Returns:   0 on success; 1 (with a message on stderr) if no numeric
#            Hugepagesize entry could be found.
function get_machine_hugepage_size() {
  local meminfo="${1:-/proc/meminfo}"
  local kb

  # Take the value field of the Hugepagesize line instead of slicing the
  # line at fixed character offsets, so column alignment does not matter.
  kb=$(awk 'tolower($1) == "hugepagesize:" { print $2; exit }' "$meminfo")

  if [[ ! "$kb" =~ ^[0-9]+$ ]]; then
    echo "cannot determine the huge page size from $meminfo" >&2
    return 1
  fi

  echo $((kb / 1024))
}

# Huge page size in MB; used to build the hugetlb.<size>MB.* control file
# names and to convert page counts into bytes.
MB=$(get_machine_hugepage_size)

# Create a test cgroup and program its hugetlb limits.
# Globals:   cgroup_path, MB, fault_limit_file, reservation_limit_file (read)
# Arguments: $1 - name of the cgroup to create under $cgroup_path
#            $2 - value written to the fault (usage) limit file
#            $3 - value written to the reservation limit file
# Outputs:   one progress line per limit written
function setup_cgroup() {
  local name="$1"
  local cgroup_limit="$2"
  local reservation_limit="$3"
  local cgroup_dir="$cgroup_path/$name"

  # All paths are quoted so that a cgroup mount point containing whitespace
  # or glob characters is handled as a single path.
  mkdir "$cgroup_dir"

  echo writing cgroup limit: "$cgroup_limit"
  echo "$cgroup_limit" >"$cgroup_dir/hugetlb.${MB}MB.$fault_limit_file"

  echo writing reseravation limit: "$reservation_limit"
  echo "$reservation_limit" \
    >"$cgroup_dir/hugetlb.${MB}MB.$reservation_limit_file"

  # When cpuset control files are present in the new cgroup, point it at
  # CPU 0 and memory node 0 (presumably required before tasks can be
  # attached when cpuset shares the hierarchy - confirm).
  if [[ -e "$cgroup_dir/cpuset.cpus" ]]; then
    echo 0 >"$cgroup_dir/cpuset.cpus"
  fi
  if [[ -e "$cgroup_dir/cpuset.mems" ]]; then
    echo 0 >"$cgroup_dir/cpuset.mems"
  fi
}

# Block until the reservation usage of a cgroup has dropped to 0.
# Globals:   cgroup_path, MB, reservation_usage_file (read)
# Arguments: $1 - cgroup name, relative to $cgroup_path
# Outputs:   a progress line and the current usage on every poll
# Returns:   0 once the usage reads 0; 1 if the usage file cannot be read
#            (previously that case ended the loop silently, as if the
#            memory had been released).
function wait_for_hugetlb_memory_to_get_depleted() {
  local cgroup="$1"
  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
  local usage

  # Wait for hugetlbfs memory to get depleted, polling every half second.
  while true; do
    if ! usage=$(cat "$path"); then
      echo "cannot read hugetlb usage from $path" >&2
      return 1
    fi
    if [[ "$usage" == 0 ]]; then
      return 0
    fi
    echo Waiting for hugetlb memory to get depleted.
    echo "$usage"
    sleep 0.5
  done
}

# Block until the reservation usage of a cgroup equals the given size.
# Globals:   cgroup_path, MB, reservation_usage_file (read)
# Arguments: $1 - cgroup name, relative to $cgroup_path
#            $2 - expected reservation usage, compared as a string
# Outputs:   a progress line and the current usage on every poll
# Returns:   0 once the usage matches; 1 if the usage file cannot be read
#            (previously that case ended the loop silently, as if the
#            reservation had been made).
function wait_for_hugetlb_memory_to_get_reserved() {
  local cgroup="$1"
  local size="$2"
  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file"
  local usage

  # Wait for hugetlbfs memory to get reserved, polling every half second.
  while true; do
    if ! usage=$(cat "$path"); then
      echo "cannot read hugetlb reservation usage from $path" >&2
      return 1
    fi
    if [[ "$usage" == "$size" ]]; then
      return 0
    fi
    echo Waiting for hugetlb memory reservation to reach size $size.
    echo "$usage"
    sleep 0.5
  done
}

# Block until the fault (non-reservation) usage of a cgroup equals the given
# size.
# Globals:   cgroup_path, MB, fault_usage_file (read)
# Arguments: $1 - cgroup name, relative to $cgroup_path
#            $2 - expected usage, compared as a string
# Outputs:   a progress line and the current usage on every poll
# Returns:   0 once the usage matches; 1 if the usage file cannot be read
#            (previously that case ended the loop silently, as if the
#            memory had been written).
function wait_for_hugetlb_memory_to_get_written() {
  local cgroup="$1"
  local size="$2"
  local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file"
  local usage

  # Wait for hugetlbfs memory to get written, polling every half second.
  while true; do
    if ! usage=$(cat "$path"); then
      echo "cannot read hugetlb usage from $path" >&2
      return 1
    fi
    if [[ "$usage" == "$size" ]]; then
      return 0
    fi
    echo Waiting for hugetlb memory to reach size $size.
    echo "$usage"
    sleep 0.5
  done
}

# Run write_hugetlb_memory.sh for one cgroup and report how much the
# cgroup's hugetlb counters moved.
# Globals:   cgroup_path, MB, fault_usage_file, reservation_usage_file (read)
#            reservation_failed, oom_killed, hugetlb_difference,
#            reserved_difference (written - these are the return values)
# Arguments: $1 - cgroup name, relative to $cgroup_path
#            $2 - size, forwarded to the writer and used as the usage value
#                 to wait for
#            $3 - populate flag ("-o" or empty)
#            $4 - write flag ("-w" or empty)
#            $5 - path forwarded to the writer
#            $6 - method (0, 1 or 2), forwarded to the writer
#            $7 - private flag ("-r" or empty)
#            $8 - 1 when the write is expected to fail
#            $9 - reserve flag ("-n" or empty)
# Outputs:   progress information and the writer's output on stdout
function write_hugetlbfs_and_get_usage() {
  local cgroup="$1"
  local size="$2"
  local populate="$3"
  local write="$4"
  local path="$5"
  local method="$6"
  local private="$7"
  local expect_failure="$8"
  local reserve="$9"

  # Function return values.
  reservation_failed=0
  oom_killed=0
  hugetlb_difference=0
  reserved_difference=0

  local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file
  local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file

  local hugetlb_before=$(cat "$hugetlb_usage")
  local reserved_before=$(cat "$reserved_usage")

  echo
  echo Starting:
  echo hugetlb_usage="$hugetlb_before"
  echo reserved_usage="$reserved_before"
  echo expect_failure is "$expect_failure"

  # Scratch file capturing the writer's output. It is local to this call and
  # removed again below so repeated calls do not pile up temporary files.
  local output
  local write_result
  local write_pid
  output=$(mktemp)

  set +e
  # Run the writer in the background when the method is 1 or 2, or the
  # mapping is private, and no failure is expected. (This is the grouping
  # the shell applies to an unparenthesised "A || B || C && D" list.)
  if [[ ("$method" == "1" || "$method" == 2 || "$private" == "-r") &&
    "$expect_failure" != 1 ]]; then

    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
      "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 |
      tee "$output" &

    # The status of starting a background job is always 0, so write_result
    # carries no information about the writer on this path. $! is the pid
    # of the last command of the pipeline, i.e. tee.
    write_result=$?
    write_pid=$!

    until grep -q -i "DONE" "$output"; do
      echo waiting for DONE signal.
      if ! ps "$write_pid" >/dev/null; then
        echo "FAIL: The write died"
        rm -f "$output"
        cleanup
        exit 1
      fi
      sleep 0.5
    done

    echo ================= write_hugetlb_memory.sh output is:
    cat "$output"
    echo ================= end output.

    if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then
      wait_for_hugetlb_memory_to_get_written "$cgroup" "$size"
    elif [[ "$reserve" != "-n" ]]; then
      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
    else
      # This case doesn't produce visible effects, but we still have
      # to wait for the async process to start and execute...
      sleep 0.5
    fi

    echo write_result is $write_result
  else
    bash write_hugetlb_memory.sh "$size" "$populate" "$write" \
      "$cgroup" "$path" "$method" "$private" "$reserve"
    write_result=$?

    if [[ "$reserve" != "-n" ]]; then
      wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size"
    fi
  fi
  # Unlinking is safe even while the background tee still has the file open.
  rm -f "$output"
  set -e

  if [[ "$write_result" == 1 ]]; then
    reservation_failed=1
  fi

  # On linus/master, the above process gets SIGBUS'd on oomkill, with
  # return code 135. On earlier kernels, it gets actual oomkill, with return
  # code 137, so just check for both conditions in case we're testing
  # against an earlier kernel.
  if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then
    oom_killed=1
  fi

  local hugetlb_after=$(cat "$hugetlb_usage")
  local reserved_after=$(cat "$reserved_usage")

  echo After write:
  echo hugetlb_usage="$hugetlb_after"
  echo reserved_usage="$reserved_after"

  hugetlb_difference=$(($hugetlb_after - $hugetlb_before))
  reserved_difference=$(($reserved_after - $reserved_before))
}

# Stop any running writer and remove the hugetlbfs mount used by a test.
# Arguments: $@ - names (relative to $cgroup_path) of the cgroups whose
#                 reservation usage must drain to 0 after the writer has been
#                 killed. With no arguments the wait is skipped, instead of
#                 polling a path built from an empty cgroup name.
# Outputs:   a progress line when a writer had to be killed
function cleanup_hugetlb_memory() {
  local cgroup

  # Killing and waiting are best effort: errors here must not abort the run.
  set +e
  if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then
    echo killing write_to_hugetlbfs
    # SIGINT (signal 2); --wait blocks until the processes have exited.
    killall -2 --wait write_to_hugetlbfs
    for cgroup in "$@"; do
      wait_for_hugetlb_memory_to_get_depleted "$cgroup"
    done
  fi
  set -e

  if [[ -e /mnt/huge ]]; then
    rm -rf /mnt/huge/*
    umount /mnt/huge
    rmdir /mnt/huge
  fi
}

# Run one single-cgroup scenario and verify that nothing stays charged.
# Globals:   cgroup_path, MB, fault_usage_file, reservation_usage_file (read)
#            hugetlb_difference, reserved_difference, reservation_failed,
#            oom_killed (written - these are the return values)
# Arguments: $1  - size to map, in huge pages
#            $2  - populate flag ("-o" or empty)
#            $3  - write flag ("-w" or empty)
#            $4  - fault limit of the cgroup, in huge pages
#            $5  - reservation limit of the cgroup, in huge pages
#            $6  - value written to /proc/sys/vm/nr_hugepages
#            $7  - method (0, 1 or 2)
#            $8  - private flag ("-r" or empty)
#            $9  - 1 when the write is expected to fail
#            $10 - reserve flag ("-n" or empty)
# Outputs:   progress lines followed by the two measured differences
function run_test() {
  # Page counts are turned into bytes with the machine's huge page size.
  local bytes_per_page=$((${MB} * 1024 * 1024))

  local size=$(($1 * bytes_per_page))
  local populate="$2"
  local write="$3"
  local cgroup_limit=$(($4 * bytes_per_page))
  local reservation_limit=$(($5 * bytes_per_page))
  local nr_hugepages="$6"
  local method="$7"
  local private="$8"
  local expect_failure="$9"
  local reserve="${10}"

  local test_cgroup="hugetlb_cgroup_test"
  local usage_prefix=$cgroup_path/$test_cgroup/hugetlb.${MB}MB

  # Function return values.
  hugetlb_difference=0
  reserved_difference=0
  reservation_failed=0
  oom_killed=0

  # Size the huge page pool for this scenario.
  echo nr hugepages = "$nr_hugepages"
  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages

  setup_cgroup "$test_cgroup" "$cgroup_limit" "$reservation_limit"

  mkdir -p /mnt/huge
  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge

  write_hugetlbfs_and_get_usage "$test_cgroup" "$size" "$populate" \
    "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \
    "$reserve"

  cleanup_hugetlb_memory "$test_cgroup"

  # After the writer is gone both counters must be back at zero.
  local final_hugetlb=$(cat $usage_prefix.$fault_usage_file)
  local final_reservation=$(cat $usage_prefix.$reservation_usage_file)

  echo $hugetlb_difference
  echo $reserved_difference
  expect_equal "0" "$final_hugetlb" "final hugetlb is not zero"
  expect_equal "0" "$final_reservation" "final reservation is not zero"
}

# Run a scenario with two cgroups, one writer each, and verify that the
# second writer does not change the first cgroup's counters and that both
# cgroups end up with nothing charged.
# Globals:   cgroup_path, MB, fault_usage_file, reservation_usage_file (read)
#            hugetlb_difference{1,2}, reserved_difference{1,2},
#            reservation_failed{1,2}, oom_killed{1,2} (written - these are
#            the return values)
# Arguments: $1-$5   - size, populate flag, write flag, fault limit and
#                      reservation limit for cgroup 1
#            $6-$10  - the same five values for cgroup 2
#            $11     - value written to /proc/sys/vm/nr_hugepages
#            $12     - method (0, 1 or 2)
#            $13     - private flag ("-r" or empty)
#            $14     - 1 when the writes are expected to fail
#            $15     - reserve flag ("-n" or empty)
# NOTE(review): sizes and limits are forwarded exactly as given, whereas
# run_test multiplies its page counts by MB * 1024 * 1024 - confirm which
# unit callers are expected to pass here.
function run_multiple_cgroup_test() {
  local size1="$1"
  local populate1="$2"
  local write1="$3"
  local cgroup_limit1="$4"
  local reservation_limit1="$5"

  local size2="$6"
  local populate2="$7"
  local write2="$8"
  local cgroup_limit2="$9"
  local reservation_limit2="${10}"

  local nr_hugepages="${11}"
  local method="${12}"
  local private="${13}"
  local expect_failure="${14}"
  local reserve="${15}"

  # Function return values.
  hugetlb_difference1=0
  reserved_difference1=0
  reservation_failed1=0
  oom_killed1=0

  hugetlb_difference2=0
  reserved_difference2=0
  reservation_failed2=0
  oom_killed2=0

  echo nr hugepages = "$nr_hugepages"
  echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages

  setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1"
  setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2"

  mkdir -p /mnt/huge
  mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge

  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \
    "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \
    "$expect_failure" "$reserve"

  hugetlb_difference1=$hugetlb_difference
  reserved_difference1=$reserved_difference
  reservation_failed1=$reservation_failed
  oom_killed1=$oom_killed

  local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file
  local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file
  local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file
  local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file

  # Snapshot cgroup 1 so we can prove the second writer leaves it untouched.
  local usage_before_second_write=$(cat "$cgroup1_hugetlb_usage")
  local reservation_usage_before_second_write=$(cat "$cgroup1_reservation_usage")

  write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \
    "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \
    "$expect_failure" "$reserve"

  hugetlb_difference2=$hugetlb_difference
  reserved_difference2=$reserved_difference
  reservation_failed2=$reservation_failed
  oom_killed2=$oom_killed

  expect_equal "$usage_before_second_write" \
    "$(cat "$cgroup1_hugetlb_usage")" "Usage changed."
  expect_equal "$reservation_usage_before_second_write" \
    "$(cat "$cgroup1_reservation_usage")" "Reservation usage changed."

  # Name the cgroups to drain: without a name the helper would poll a usage
  # file path built from an empty cgroup name.
  cleanup_hugetlb_memory "hugetlb_cgroup_test1" "hugetlb_cgroup_test2"

  local final_hugetlb=$(cat "$cgroup1_hugetlb_usage")
  local final_reservation=$(cat "$cgroup1_reservation_usage")

  expect_equal "0" "$final_hugetlb" \
    "hugetlb_cgroup_test1 final hugetlb is not zero"
  expect_equal "0" "$final_reservation" \
    "hugetlb_cgroup_test1 final reservation is not zero"

  local final_hugetlb=$(cat "$cgroup2_hugetlb_usage")
  local final_reservation=$(cat "$cgroup2_reservation_usage")

  expect_equal "0" "$final_hugetlb" \
    "hugetlb_cgroup_test2 final hugetlb is not zero"
  expect_equal "0" "$final_reservation" \
    "hugetlb_cgroup_test2 final reservation is not zero"
}

# Main driver. Start from a clean state, then run the scenarios below for
# every combination of:
#   populate - "" or "-o"
#   method   - 0, 1 or 2 (per the skip comments below, 1 is
#              mmap(MAP_HUGETLB) and 2 is shmem; 0 is presumably a
#              hugetlbfs file mapping - confirm)
#   private  - "" or "-r"
#   reserve  - "" or "-n"
# run_test arguments are, in order: size in pages, populate flag, write
# flag, fault limit in pages, reservation limit in pages, nr_hugepages,
# method, private flag, expect_failure, reserve flag.
cleanup

for populate in "" "-o"; do
  for method in 0 1 2; do
    for private in "" "-r"; do
      for reserve in "" "-n"; do

        # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported.
        if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then
          continue
        fi

        # Skip populated shmem tests. Doesn't seem to be supported.
        if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then
          continue
        fi

        # Skip shmem tests with the "-n" reserve flag as well.
        # NOTE(review): the reason is not stated anywhere in this file.
        if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then
          continue
        fi

        # Scenario 1: map 5 pages without writing, limits of 10 pages.
        cleanup
        echo
        echo
        echo
        echo Test normal case.
        echo private=$private, populate=$populate, method=$method, reserve=$reserve
        run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve"

        echo Memory charged to hugtlb=$hugetlb_difference
        echo Memory charged to reservation=$reserved_difference

        # Fault usage is only expected to move when the mapping is populated.
        if [[ "$populate" == "-o" ]]; then
          expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
            "Reserved memory charged to hugetlb cgroup."
        else
          expect_equal "0" "$hugetlb_difference" \
            "Reserved memory charged to hugetlb cgroup."
        fi

        # Reservation usage is expected to move unless "-n" was given and
        # the mapping was not populated.
        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
          expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
            "Reserved memory not charged to reservation usage."
        else
          expect_equal "0" "$reserved_difference" \
            "Reserved memory not charged to reservation usage."
        fi

        echo 'PASS'

        # Scenario 2: map and write 5 pages, limits of exactly 5 pages.
        cleanup
        echo
        echo
        echo
        echo Test normal case with write.
        echo private=$private, populate=$populate, method=$method, reserve=$reserve
        run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve"

        echo Memory charged to hugtlb=$hugetlb_difference
        echo Memory charged to reservation=$reserved_difference

        expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \
          "Reserved memory charged to hugetlb cgroup."

        expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \
          "Reserved memory not charged to reservation usage."

        echo 'PASS'

        cleanup
        # NOTE(review): this unconditional "continue" makes every scenario
        # from here to the end of the loop body unreachable. It looks like a
        # debugging leftover - confirm whether the scenarios below are meant
        # to run before removing it.
        continue
        echo
        echo
        echo
        echo Test more than reservation case.
        echo private=$private, populate=$populate, method=$method, reserve=$reserve

        if [ "$reserve" != "-n" ]; then
          run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \
            "$reserve"

          expect_equal "1" "$reservation_failed" "Reservation succeeded."
        fi

        echo 'PASS'

        cleanup

        echo
        echo
        echo
        echo Test more than cgroup limit case.
        echo private=$private, populate=$populate, method=$method, reserve=$reserve

        # Not sure if shm memory can be cleaned up when the process gets sigbus'd.
        if [[ "$method" != 2 ]]; then
          run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve"

          expect_equal "1" "$oom_killed" "Not oom killed."
        fi
        echo 'PASS'

        cleanup

        echo
        echo
        echo
        echo Test normal case, multiple cgroups.
        echo private=$private, populate=$populate, method=$method, reserve=$reserve
        run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \
          "$populate" "" "10" "10" "10" \
          "$method" "$private" "0" "$reserve"

        echo Memory charged to hugtlb1=$hugetlb_difference1
        echo Memory charged to reservation1=$reserved_difference1
        echo Memory charged to hugtlb2=$hugetlb_difference2
        echo Memory charged to reservation2=$reserved_difference2

        # NOTE(review): the expected values here are the raw numbers 3 and 5,
        # while the single-cgroup checks above compare against
        # N * MB * 1024 * 1024 - confirm the intended unit.
        if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then
          expect_equal "3" "$reserved_difference1" \
            "Incorrect reservations charged to cgroup 1."

          expect_equal "5" "$reserved_difference2" \
            "Incorrect reservation charged to cgroup 2."

        else
          expect_equal "0" "$reserved_difference1" \
            "Incorrect reservations charged to cgroup 1."

          expect_equal "0" "$reserved_difference2" \
            "Incorrect reservation charged to cgroup 2."
        fi

        if [[ "$populate" == "-o" ]]; then
          expect_equal "3" "$hugetlb_difference1" \
            "Incorrect hugetlb charged to cgroup 1."

          expect_equal "5" "$hugetlb_difference2" \
            "Incorrect hugetlb charged to cgroup 2."

        else
          expect_equal "0" "$hugetlb_difference1" \
            "Incorrect hugetlb charged to cgroup 1."

          expect_equal "0" "$hugetlb_difference2" \
            "Incorrect hugetlb charged to cgroup 2."
        fi
        echo 'PASS'

        cleanup
        echo
        echo
        echo
        echo Test normal case with write, multiple cgroups.
        echo private=$private, populate=$populate, method=$method, reserve=$reserve
        run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \
          "$populate" "-w" "10" "10" "10" \
          "$method" "$private" "0" "$reserve"

        echo Memory charged to hugtlb1=$hugetlb_difference1
        echo Memory charged to reservation1=$reserved_difference1
        echo Memory charged to hugtlb2=$hugetlb_difference2
        echo Memory charged to reservation2=$reserved_difference2

        expect_equal "3" "$hugetlb_difference1" \
          "Incorrect hugetlb charged to cgroup 1."

        expect_equal "3" "$reserved_difference1" \
          "Incorrect reservation charged to cgroup 1."

        expect_equal "5" "$hugetlb_difference2" \
          "Incorrect hugetlb charged to cgroup 2."

        expect_equal "5" "$reserved_difference2" \
          "Incorrected reservation charged to cgroup 2."
        echo 'PASS'

        cleanup

      done # reserve
    done   # private
  done     # method
done       # populate

# Undo the cgroup mount if this script created it.
if [[ $do_umount ]]; then
  umount $cgroup_path
  rmdir $cgroup_path
fi

# Put the huge page pool back to the size recorded at startup.
echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages
