#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Basic EEH selftest: pick every function-0 PCI device that is not a
# bridge or a VF and whose PE is healthy, break each one in turn with
# eeh_one_dev, and report how many failed to recover.
#
# Exit status: 0 when EEH is unsupported (skip) or every device
# recovered; 1 on a setup failure; otherwise the number of devices that
# failed, capped at 255.
#
# eeh_supported, pe_ok and eeh_one_dev are provided by eeh-functions.sh.

. ./eeh-functions.sh

if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit 0
fi

# NOTE(review): this only bails out when BOTH files are missing. If the
# helpers need each of them, the test should probably be "||" - confirm
# against eeh-functions.sh before changing it.
if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit 1
fi

# Snapshot the lspci output so it can be diffed once recovery is done.
pre_lspci="$(mktemp)" || {
	echo "Unable to create a temporary file" >&2
	exit 1
}

# Remove the snapshot on every exit path. Not every sh runs the EXIT
# trap when killed by a signal, so turn the common ones into an exit.
trap 'rm -f -- "$pre_lspci"' EXIT
trap 'exit 1' HUP INT TERM

lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices (function 0 of each device only).
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left behind as the literal pattern.
	[ -e "$dev_path" ] || continue

	dev="${dev_path##*/}"

	# Skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check.
	devices="$devices $dev"
	dev_count="$((dev_count + 1))"
done

echo "Found ${dev_count} breakable devices..."

failed=0
# $devices is a space-separated list, so word splitting is intended here.
for dev in $devices ; do
	echo "Breaking $dev..."

	# Re-check the PE state: it may have changed since the list was built.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed="$((failed + 1))"
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed="$((failed + 1))"
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"

# Show any difference between the device list before and after the test.
lspci | diff -u "$pre_lspci" -

# Exit statuses are truncated to 8 bits, so an unclamped count of 256
# failures would be reported as success.
if [ "$failed" -gt 255 ] ; then
	exit 255
fi

exit "$failed"