Network Configs & Performance Tuning - alex-aleyan/linux_wiki GitHub Wiki

Tools:

  • Collectd (Monitor CPU, memory, and network utilization)
  • Oprofile (allows the user to monitor the sw threads, excessive processing/loops, etc).
  • ntrace (NightTrace)
  • ntune (NightTune)
  • top
  • cpuid
  • /PROC:
    • /proc/
    • /proc/stat
    • /proc/cmdline
    • /proc/cpuinfo
    • /proc/interrupts
    • /proc/diskstats and /sys/block/<device>/stat
  • Memory Tools:
    • vmstat
    • valgrind
    • pmap
    • slabtop
    • free
  • Network
    • netstat
    • ethtool
    • traceroute & tracepath

Configuring static networking (for intra-networks only):

  1. Configure network access:

    nmtui
    
  2. Disable NetworkManager if installed:

    service NetworkManager stop
    service NetworkManager status
    chkconfig NetworkManager off
    chkconfig NetworkManager --list
    
  3. Restart the network service and make sure that the interface came up:

    chkconfig --level 2345 network on
    chkconfig network --list
    service network restart
    ifconfig
    
  4. Disable IP Tables:

    service iptables stop
    service iptables status
    chkconfig iptables off
    chkconfig iptables --list
    
  5. Disable SELinux:

    vim /etc/sysconfig/selinux
    SELINUX=disabled
    
  6. Install rpcbind from CD:

    yum install nfs-utils rpcbind
    chkconfig rpcbind on
    service rpcbind start
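
For reference, the resulting static configuration lives in /etc/sysconfig/network-scripts/ifcfg-<interface>. A minimal sketch, assuming a hypothetical eth0 on the 192.168.10.0/24 subnet used elsewhere on this page (adjust DEVICE, addresses, and gateway to your hardware):

    DEVICE=eth0
    ONBOOT=yes
    BOOTPROTO=none
    IPADDR=192.168.10.81
    NETMASK=255.255.255.0
    GATEWAY=192.168.10.1
    NM_CONTROLLED=no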
    

Tuning network performance for maximum throughput:


Clean XES

Current BIOS Settings

  • BIOS→Setup Utilities→Advanced→
    • Advanced→Peripheral Configuration→PCIe Max Read Request Size Auto???
    • Processor Configuration->Hyper-Threading[ALL]
    • Advanced Power Management Configuration→CPU P State Control
      • **Energy Efficient P_State**
      • **Boot Performance mode**
      • Turbo Mode
    • Advanced Power Management Configuration
      • CPU C State Control->CPU C State
      • CPU-Advanced PM Tuning→Energy Perf BIAS
        • Power Performance Tuning
        • ENERGY_PERF_BIAS_CFG mode
        • Power/Performance Switch
        • Workload Configuration ???
      • Program powerCTL_MSR->EnergyEfficient Turbo
    • Memory Configuration ->
      • Memory Thermal -> Memory Power Savings Mode
      • Memory Timing & Voltage Override/Memory Frequency <auto???>

Optional: Install OFED (MLNX_OFED_LINUX-3.4-1.0.0.0 (OFED-3.4-1.0.0))

[root@hn_server ~]# yum install --installroot=/opt/chroots/hpc_os_images/rhel-xes_hpc_new python-devel lsof redhat-rpm-config rpm-build libxml2-python gcc kernel-devel-2.6.32-642.el6.x86_64
[root@hn_server ~]# yum install --installroot=/opt/chroots/hpc_os_images/rhel-xes_hpc_new gtk2 atk cairo gcc-gfortran numactl tcsh tk dmidecode
[root@hn_server ~]# cp -r /lib/modules/2.6.32-642.el6.x86_64 rhel-xes_hpc_new/lib/modules/
[root@hn_server ~]# cp -r /pathtoDrivers/MLNX_OFED_LINUX-3.4-1.0.0.0-rhel6.8-x86_64 /opt/chroots/hpc_os_images//rhel-xes_hpc_new/root/
[root@hn_server ~]# chroot /opt/chroots/hpc_os_images//rhel-xes_hpc_new/
[root@hn_server ~]# cd /root/MLNX_OFED_LINUX-3.4-1.0.0.0-rhel6.8-x86_64
[root@hn_server /]# ./mlnxofedinstall --add-kernel-support --skip-repo
[root@blade01 ~]# mlnx_tune -p THROUGHPUT

Install cpupower command

[root@hn_server ~]# yum remove --installroot=/opt/chroots/hpc_os_images//rhel-xes_hpc_new/ cpufrequtils
[root@hn_server ~]# yum install --installroot=/opt/chroots/hpc_os_images//rhel-xes_hpc_new cpupowerutils

(USE METHOD SHOWN NEXT INSTEAD OF THIS METHOD) Configure NICs as ETH

[root@hn_server ~]# ssh blade01  
[root@blade01 ~]# /sbin/connectx_port_config
[root@hn_server ~]# vi /opt/chroots/hpc_os_images//rhel-xes_hpc_new/etc/infiniband/connectx.conf
# ConnectX Port Configuration for 0000:03:00.0
/sbin/connectx_port_config -d 0000:03:00.0 -c eth
# ConnectX Port Configuration for 0000:85:00.0
/sbin/connectx_port_config -d 0000:85:00.0 -c eth

Add this script to /etc/rc.d/rc5.d/S10network:

[root@hn_server1 ~]# vi /etc/rc.d/rc5.d/S10network

# Switch every Mellanox ConnectX port 1 to Ethernet mode at boot
MLX_PORTS=($(find /sys -name "mlx4_port1"))
for PORT in ${MLX_PORTS[@]}
do
  echo eth > $PORT
done

/etc/rc.d/rc5.d/S99local

[root@hn_server1 ~]# vi /etc/rc.d/rc5.d/S99local

#SET CPU GOV TO PERFORMANCE (PROBLEM: DOES NOT SET ALL CPUS to 1.8GHz!!!!):
cpupower --perf-bias 0
cpupower frequency-set -g performance
if [[ $? -ne 0 ]]; then
  NUMOFCPUS=$(nproc --all)
  i=0
  while test $i -ne ${NUMOFCPUS}; do
    cpufreq-set -c $i -g performance
    echo -n "CPU${i}: "; cpufreq-info -c $i | grep "current CPU frequency is"
    ((i++))
  done
fi

#SET CPU GOV TO PERFORMANCE using cpupower utility (better than cpufreq-set):
cpupower frequency-set -g performance

#for current setup with 2 blades (use ethtool -i eth3 to determine the bus):
setpci -s 03:00.0 68.w=5936
setpci -s 85:00.0 68.w=5936

ifconfig eth3 mtu 9000 txqueuelen 100000
ifconfig eth4 mtu 9000 txqueuelen 100000

#configure a single Ring Buffer since using UDP:
ethtool -L eth3 tx 1 rx 1
ethtool -L eth4 tx 1 rx 1

#Maximize the depth of the Ring Buffer:
ethtool -G eth3 rx 8192 tx 4096
ethtool -G eth4 rx 8192 tx 4096

#Increase the budget:
sysctl -w net.core.netdev_budget=600

modprobe coretemp
modprobe ipmi-si


[root@test1 ~]# vi /etc/sysctl.conf
net.core.rmem_max = 536870912 #536870912 bytes = 512 * 2^20 bytes = 512 MB (megabytes, not megabits)
net.core.wmem_max = 536870912 #512MB
net.core.rmem_default = 536870912 #512MB
net.core.wmem_default = 536870912 #512MB
net.core.netdev_max_backlog = 40000

net.ipv4.tcp_rmem = 4096 33554432 536870912 # 4KB, 32MB, 512MB
net.ipv4.tcp_wmem = 4096 33554432 536870912 # 4KB, 32MB, 512MB
net.ipv4.tcp_mem = 33554432 33554432 33554432 # 32MB, 32MB, 32MB
net.ipv4.tcp_mtu_probing = 1
net.ipv4.tcp_slow_start_after_idle = 0

net.ipv4.route.flush = 1

net.ipv4.conf.all.arp_filter = 1
net.ipv4.conf.all.rp_filter = 0
net.ipv4.neigh.default.gc_thresh1 = 8192 # ARP cache entries (not bytes)
net.ipv4.neigh.default.gc_thresh2 = 8192
net.ipv4.neigh.default.gc_thresh3 = 8192
net.ipv4.neigh.default.gc_stale_time = 3600

#Force IGMP version 2

net.ipv4.conf.all.force_igmp_version = 2
net.ipv4.conf.default.force_igmp_version = 2

#figure these ones out:
net.ipv4.udp_mem = 12364032 16485376 24728064 ???
net.ipv4.udp_rmem_min = 4096 ???
net.ipv4.udp_wmem_min = 4096 ???

ALL

NIC

watch -n 0.1 "ethtool --statistics eth3 | grep -E \ "offload|desc|jabber||tx_packets|tx_bytes|rx_packets|rx_bytes|error|drop| \
collisions|aggregated|flushed|stop|timeout|fail|filter|clean\" 
ethtool -l eth3
ethtool -L eth3 tx <NUM> rx <NUM>
ethtool -g eth3
ethtool -G eth3 rx <NUM> tx <NUM>

KERNEL

watch -d -n 1 'netstat -su' 
watch -d -n 1 'netstat -i' 
watch -d -n 1 'netstat -nau' 

NUMA CPU

turbostat --interval 1 --debug
watch -d numastat -cn

MEMORY

watch -d numactl -H
watch -d numastat -mc
watch -d numastat -cn

IRQ

watch -d cat /proc/interrupts

cat /proc/irq/<IRQ_NUM>/smp_affinity # IRQ_NUM found from /proc/interrupts file
echo ffff > /proc/irq/<IRQNUM>/smp_affinity

cat /proc/irq/<IRQ_NUM>/smp_affinity_list # IRQ_NUM found from /proc/interrupts file
echo 10 > /proc/irq/<IRQNUM>/smp_affinity_list
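
Note: the two files take different formats. smp_affinity expects a hexadecimal CPU bitmask, while smp_affinity_list expects a list or range of CPU numbers; the values below are only placeholders, not recommendations.

echo 0f > /proc/irq/<IRQ_NUM>/smp_affinity         # bitmask 0x0f = CPUs 0-3
echo 10-13 > /proc/irq/<IRQ_NUM>/smp_affinity_list # CPUs 10 through 13 as a range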

Sources

https://community.mellanox.com/docs/DOC-2490
https://www.coverfire.com/articles/queueing-in-the-linux-network-stack/
https://access.redhat.com/sites/default/.../20150325_network_performance_tuning.pdf
https://gist.github.com/wmealing/2dd2b543c4d3cff6cab7
http://www.breakage.org/2012/11/14/processor-max_cstate-intel_idle-max_cstate-and-devcpu_dma_latency \

ethtool -i eth3                                     #Determine PCI bus per interface
cat /sys/class/net/eth3/device/numa_node            #Determine NUMA node per interface
view /sys/bus/pci/devices/0000\:83\:00/numa_node    #Determine NUMA node per PCI bus

numactl -H | grep "node 1 cpus"
numactl --show
numactl -H

ethtool eth3
ethtool -g eth3
ethtool -k eth3
ethtool -l eth3
netstat -r
#netstat -M

sysctl net.ipv4
sysctl net.core

ON THE RUN:
ethtool -c eth3
ethtool -a eth3
netstat --interfaces=eth3
netstat -su
netstat -neopa

egrep "CPU0|eth3" /proc/interrupts 

cat /proc/net/softnet_stat #
#1. All columns except the 1st should have 0s; if not, double the net.core.netdev_budget value:
sysctl net.core.netdev_budget
  net.core.netdev_budget = 300
sysctl -w net.core.netdev_budget=600
#2. If the second column has a non-zero value while the corresponding third column value is zero, adjust the buffer depth and increase net.core.netdev_max_backlog
sysctl net.core.netdev_max_backlog
  net.core.netdev_max_backlog = 1000
sysctl -w net.core.netdev_max_backlog=1500
ethtool -g
ethtool -G eth4 rx 2048 tx 1024


watch -n 1 "egrep \"CPU|eth3\" /proc/interrupts"
watch -n 0.1 'cat /proc/net/softnet_stat'
watch -n 0.1 "ethtool --statistics eth3 | grep -E \"tx_packets|tx_bytes|rx_packets|rx_bytes|errors|dropped|collisions|aggregated|flushed|stopped|timeout|fail|filtered|clean\" "
watch -n 0.1 "netstat --interfaces=eth3; netstat --statistics"
watch -n 0.1 "netstat --interfaces=eth3; netstat --statistics""
netstat --route

if "ethtool --statistics | grep queue_stopped" keeps incrementing
[root@localhost Desktop] ethtool -l eth3
Channel parameters for eth3:
Pre-set maximums:
RX:		128
TX:		32
Other:		0
Combined:	0
Current hardware settings:
RX:		16
TX:		28
Other:		0
Combined:	0
[root@localhost Desktop]# ssh hpc_node02 ethtool -L eth3 rx 1 tx 1

[root@localhost Desktop]ethtool -g eth3
Ring parameters for eth3:
Pre-set maximums:
RX:		8192
RX Mini:	0
RX Jumbo:	0
TX:		8192
Current hardware settings:
RX:		1024
RX Mini:	0
RX Jumbo:	0
TX:		512
[root@localhost Desktop]ethtool -G eth3 rx 8192 tx 4096

[root@localhost Desktop] ifconfig eth3 mtu 9000 txqueuelen 10000

mlnx_tune -p HIGH_THROUGHPUT

ethtool -C
sysctl -w

cat /sys/class/net/eth3/duplex
cat /sys/class/net/eth3/statistics


Problem Notes:

 egrep "CPU|eth3" /proc/interrupts
  CPU13 counts up
  
 ethtool --statistics eth3:
  rx_crc_errors: counts up every other time by 1
  vport_rx_filtered: counts up every other time by 1

 qperf:
  msg_rate = 180 K/sec vs the good msg_rate = 516 K/sec
  msg_size =  8.97 KB same as good msg_size = 8.97 KB
  recv_cost = 1.26~1.56 sec/GB = 3 * send_cost; where send_cost = 422~466 ms/GB 
  

Current XES Settings

BIOS→Setup Utilities→Advanced→Advanced→Peripheral Configuration→PCIe Max Read Request Size Auto???

BIOS→Setup Utilities→Advanced→Processor Configuration->Hyper-Threading[ALL]

**BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU P State Control→Energy Efficient P_State**

**BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU P State Control→Boot Performance mode**

BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU C State Control→Enhanced Halt State (C1E)
BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU C State Control→Package C State Limit <C0/C1 State>
BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU C State Control→CPU C State

BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU-Advanced PM Tuning→Energy Perf BIAS→Power Performance Tuning
BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU-Advanced PM Tuning→Energy Perf BIAS→ENERGY_PERF_BIAS_CFG mode
BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU-Advanced PM Tuning→Energy Perf BIAS→Power/Performance Switch
BIOS→Setup Utilities→Advanced→Advanced Power Management Configuration→CPU-Advanced PM Tuning→Program powerCTL_MSR→EnergyEfficient Turbo

BIOS→Setup Utilities→Advanced→Memory Thermal→Memory Power Savings Mode
BIOS→Setup Utilities→Advanced→Memory Timing & Voltage Override→Memory Frequency <auto???>

Configuring BIOS settings

  1. In BIOS->Setup Utilities->Advanced->Processor Configuration: Hyper-Threading[ALL]
  2. In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU P State Control: Energy Efficient P-State, Boot Performance Mode, Turbo Mode
  3. In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU C State Control: CPU C State
  4. In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Energy Perf BIAS: Power Performance Tuning ENERGY_PERF_BIAS_CFG mode Power/Performance Switch Workload Configuration

WHAT SHOULD I DO ABOUT THESE VALUES: Averaging Time Window [23] ??? P0 TotalTimeThreshold Low [35] ??? P0 TotalTimeThreshold High [58] ???

4.1 In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Program powerCTL_MSR: EnergyEfficient Turbo

5. In BIOS→Setup Utilities→Advanced→Memory Thermal: Memory Power Savings Mode

4.2 In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Program PP0_CURT_CFG_CTRL_MSR: PP0 Current_Cfg_Ctl Ovrd ???? Current Config ????

4.2.1 In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Program PP0_CURT_CFG_CTRL_MSR->PSI CONFIG: PSI3 Threshold [1] ?????? PSI2 Threshold [5] ?????? PSI1 Threshold [20] ?????? Lock Indication ??????

4.3 In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Program CSR_ENTRY_CRITERIA: PKGC_ENTRY_CRITERIA OVRD ???SET TO MANUAL???

CPU0 Advanced PM Tuning

4.3.1 In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Program CSR_ENTRY_CRITERIA->CPU0Advanced PM Turning: QPI_0_IN CPU0 QPI_1_IN ?????? CPU0 QPI_2_IN ?????? PCIE Port0 PCIE_IN ?????? PCIE Port1 PCIE_IN ?????? PCIE Port2 PCIE_IN ?????? PCIE Port3 PCIE_IN ?????? PCIE Port4 PCIE_IN ?????? PCIE Port5 PCIE_IN ?????? PCIE Port6 PCIE_IN ?????? PCIE Port7 PCIE_IN ?????? PCIE Port8 PCIE_IN ?????? PCIE Port9 PCIE_IN ?????? PCIE Port10 PCIE_IN ??????

4.4 In BIOS->Setup Utilities->Advanced->Advanced Power Management Configuration->CPU-Advanced PM Tuning->Program CSR_SWLTROVRD: Snoop Latency Valid ?????? Snoop Latency Override ?????? Snoop Latency Multiplier ?????? Snoop Latency Value ?????? Non-Snoop Latency Valid ?????? Non-Snoop Latency Override ?????? NonSnoop LatencyMultiplier [0] ?????? Non-Snoop Latency Value [0] ??????

  1. P-State: if enabled, all CPUs on a given NUMA node will go to "sleep" mode when there is no activity.
  2. C-State: each CPU has several power modes called "C-states" or "C-modes" which lower CPU power when it idles. Verify that the C-State is disabled: cat /sys/module/intel_idle/parameters/max_cstate # we want it to be 0 (CPU fully turned on; no sleep); the command should return 0. If not disabled (greater than 0), use the kernel arguments: processor.max_cstate=1 intel_idle.max_cstate=0 "wwignoremod=mlx4_en,mlx4_core,mlx4_ib,cxb3,processor.max_cstate=1 intel_idle.max_cstate=0"
  3. Turbo Mode (Intel): runs the processor core faster than the rated frequency.
  4. Hyper-Threading: allows a CPU to work on multiple streams of data simultaneously. Disable Hyper-Threading if the CPU runs single-threaded tasks or is close to 100% utilization; enable it if the CPU runs multi-threaded tasks.
  5. IO Non Posted Prefetching: relevant to Haswell/Broadwell. Not always exposed on all BIOS versions.
  6. CPU Frequency: should be maxed.
  7. Memory Speed: should be maxed.
  8. Memory Channel Mode: specifies whether the memory channel has its own memory controller that operates the memory at full speed.
  9. Node Interleaving: enabling Node Interleaving means that memory is interleaved between memory nodes, and there is no NUMA presentation to the operating system. When disabled, NUMA mode is enabled. We want NUMA mode to be enabled.
  10. Channel Interleaving: splits the RAM into sections to enable multiple reads/writes at the same time.
  11. Thermal Mode: high-power mode runs higher fan speeds.
  12. HPC Optimizations: similar to the C-state settings; supported on AMD processors only.
  13. BIOS Settings Example:
    Main window: Quiet Boot disabled, POST Error Pause disabled
    Advanced -> Processor Configuration: Intel(R) Hyper-Threading Tech ???Should be disabled????, Active Processor Cores, Execute Disable Bit, Intel(R) Virtualization, Intel(R) TXT, Enhanced Error Containment Mode, MLC Streamer, MLC Spatial Prefetcher, DCU Data Prefetcher, DCU Instruction Prefetcher, Direct Cache Access (DCA), Extended ATR (0x03), PFloor Tuning [12]
    Advanced -> Power and Performance: CPU Power and Performance Policy, Workload Configuration
    Advanced -> Uncore Power Management: Uncore Frequency Scaling, Performance P-Limit
    Advanced -> CPU P-State Control: Enhanced Intel SpeedStep(R) Tech # should probably be set to instead???, Intel Configurable TDP, Intel(R) Turbo Boost Technology, Energy Efficient Turbo ????Should be Disabled?????
    Advanced -> CPU C-State Control: CPU C-State, C1E Autopromote, Processor C3, Processor C6
    System Acoustic and Performance Configuration: Set Fan Profile, Fan PWM Offset [0]

Configuring CPU settings with CPUPOWER command

  1. Remove cpufrequtils: yum remove --installroot=/opt/chroots/hpc_os_images//rhel/ cpufrequtils
  2. Install cpupowerutils: yum install --installroot=/opt/chroots/hpc_os_images//rhel/ cpupowerutils
  3. Build vnfs (STILL NEED TO FIGURE OUT HOW TO MAKE IT PERMANENT): cpupower frequency-info
    3.1. On the node: cpupower frequency-set --freq 1800000
    3.2. Short script for /opt/chroots/hpc_os_images//rhel-hpc/etc/rc5.d/S99local:
#SET CPU GOV TO PERFORMANCE:
cpupower --perf-bias 0
cpupower --sched-mc 0
cpupower --sched-smt 0
cpupower frequency-set -g performance
if [[ $? -ne 0 ]]; then
  NUM_OF_CPUS=$(nproc --all)
  i=0
  while test $i -ne ${NUM_OF_CPUS}; do
    cpufreq-set -c $i -g performance
    echo -n "CPU${i}: "; cpufreq-info -c $i | grep " current CPU frequency is"
    ((i++))
  done
fi

#for current setup with 2 blades (use ethtool -i eth3 to determine the bus):
setpci -s 03:00.0 68.w=5936
setpci -s 85:00.0 68.w=5936

#configure a single Ring Buffer since using UDP:
ethtool -L eth3 tx 1 rx 1
ethtool -L eth4 tx 1 rx 1

#Maximize the depth of the Ring Buffer:
ethtool -G eth3 rx 8192 tx 4096
ethtool -G eth4 rx 8192 tx 4096

ifconfig eth3 mtu 9000 txqueuelen 100000
ifconfig eth4 mtu 9000 txqueuelen 100000

#Increase the budget:
sysctl -w net.core.netdev_budget=600

modprobe coretemp
modprobe ipmi-si

touch /var/lock/subsys/local

Configuring CPU settings with CPUFREQ command

  1. Install cpufrequtils: yum install --installroot=/opt/chroots/hpc_os_images//rhel/ cpufrequtils
  2. Build vnfs (STILL NEED TO FIGURE OUT HOW TO MAKE IT PERMANENT): cpufreq-info -c CPU_NUM
  3. On the node: cpufreq-set -c $CPU_NUM -g performance
  4. Short script for /etc/rc5.d/S99local:
#SET CPU GOV TO PERFORMANCE:
cpupower --perf-bias 0
cpupower --sched-mc 0
cpupower --sched-smt 0
cpupower frequency-set -g performance
if [[ $? -ne 0 ]]; then
  NUM_OF_CPUS=$(nproc --all)
  i=0
  while test $i -ne ${NUM_OF_CPUS}; do
    cpufreq-set -c $i -g performance
    echo -n "CPU${i}: "; cpufreq-info -c $i | grep " current CPU frequency is"
    ((i++))
  done
fi

#for current setup with 2 blades (use ethtool -i eth3 to determine the bus):
setpci -s 03:00.0 68.w=5936
setpci -s 85:00.0 68.w=5936

#configure a single Ring Buffer since using UDP:
ethtool -L eth3 tx 1 rx 1
ethtool -L eth4 tx 1 rx 1

#Maximize the depth of the Ring Buffer:
ethtool -G eth3 rx 8192 tx 4096
ethtool -G eth4 rx 8192 tx 4096

ifconfig eth3 mtu 9000 txqueuelen 100000
ifconfig eth4 mtu 9000 txqueuelen 100000

#Increase the budget:
sysctl -w net.core.netdev_budget=600

modprobe coretemp
modprobe ipmi-si

touch /var/lock/subsys/local

Mellanox Procedure

Note:

  • The PCIe generation has to suit the adapter (in most cases PCIe Gen3, which operates at 8GT/s).
  • Using an x16 slot is beneficial even when an x8 adapter is used, since additional buffers are allocated by the CPU.
  • Make sure the NUMA architecture of the motherboard is well understood and configured.
  • Use mlnx_tune to tune the system.
  1. You can adjust receive and transmit buffer sizes using: ethtool -g eth4 ethtool -G eth0 rx 4096 tx 2048
  2. Determine Speed in GT/s (Gen1=2.5GT/s, Gen2=5GT/s, Gen3=8GT/s, where GT/s stands for billion transfers per second; LnkCap is Link Capabilities and LnkSta is Link Status):

    ethtool -i eth3
      driver: mlx4_en
      version: 3.4-1.0.0 (25 Sep 2016)
      firmware-version: 2.40.5000
      bus-info: 0000:03:00.0
      supports-statistics: yes
      supports-test: yes
      supports-eeprom-access: no
      supports-register-dump: no
      supports-priv-flags: yes
    lspci -s 03:00.0 -vvv | grep Speed
      LnkCap: Port #8, Speed 8GT/s, Width x8
      LnkSta: Speed 5GT/s, Width x8
  3. Check lspci Width: lspci -s 03:00.0 -vvv | grep Width # determine pci lanes (Mellanox supports x8 and x16)
  4. You can also determine Generation instead of speed: lspci -s 03:00.0 -vvv | grep "PCIe Gen" [V0] Vendor Specific: PCIe Gen3 x8
  5. Determine PCIe Max Read Request (determines the maximal PCIe read request allowed (affects the number of pending requests when using data fetch larger than the PCIe MTU): lspci -s 03:00.0 -vvv | grep DevCtl: -C 2
  6. PCIe MaxReadRequest can be changed using setpci command during the run time or when placed in /etc/rc.d/rc.local ( 0 - 128B, 1 - 256B, 2 - 512B, 3 - 1024B, 4 - 2048B and 5 - 4096B): lspci -s 03:00.0 -vvv | grep MaxReadReq setpci -s 03:00.0 68.w=0936 setpci -s 03:00.0 68.w=1936 . . . setpci -s 03:00.0 68.w=5936
  7. Calculating PCIe Limitations (subtract ~1Gb/s for error correction protocols and PCIe header overheads): Maximum PCIe Bandwidth = Speed * Width * (1 - ENCODING) - 1Gb/s
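
For example, applying the formula to the adapter above: LnkCap advertises Gen3 x8 (8 GT/s, 128b/130b encoding, so ENCODING ≈ 2/130 ≈ 1.5%), giving roughly 8 * 8 * (1 - 2/130) - 1 ≈ 62 Gb/s, while the negotiated LnkSta of 5 GT/s (Gen2, 8b/10b encoding, ENCODING = 20%) gives only 5 * 8 * 0.8 - 1 = 31 Gb/s. These are rough estimates, but they show why a link that trains down to Gen2 can become the bottleneck.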
# mlnx_tune -h
Usage: mlnx_tune [options]

Options:
  -h, --help            show this help message and exit
  -d, --debug_info      dump system debug information without setting a profile
  -r, --report          Report HW/SW status and issues without setting a profile
  -c, --colored         Switch using colored/monochromed status reports. Only applicable with --report
  -p PROFILE, --profile=PROFILE
                        Set profile and run it. choose from: ['HIGH_THROUGHPUT', 'IP_FORWARDING_MULTI_STREAM_THROUGHPUT', 'IP_FORWARDING_MULTI_STREAM_PACKET_RATE', 'IP_FORWARDING_SINGLE_STREAM', 'IP_FORWARDING_SINGLE_STREAM_0_LOSS', 'IP_FORWARDING_SINGLE_STREAM_SINGLE_PORT', 'LOW_LATENCY_VMA']
  -q, --verbosity       print debug information to the screen [default False]
  -v, --version         print tool version and exit [default False]
  -i INFO_FILE_PATH, --info_file_path=INFO_FILE_PATH
                        info_file path. [default %s]

mlnx_tune -r

mlnx_tune -p HIGH_THROUGHPUT

  1. To correct the issue when the interface type comes up as INFINIBAND, run the command below on a booted cluster and set the interface to eth: [root@hpc ~] /sbin/connectx_port_config
  2. The command above writes some content into the /etc/infiniband/connectx.conf file; copy the generated content to the connectx.conf file on your Master Node: [root@hn_server ~]# vi /opt/chroots/hpc_os_images//rhel-hpc/etc/infiniband/connectx.conf /sbin/connectx_port_config -d 0000:83:00.0 -c eth

Setup QPERF

Resources: https://fasterdata.es.net/network-tuning/udp-tuning \

Consider:

  • Using jumbo frames: performance will be 4-5 times better using 9K MTUs.
  • The packet size: best performance is MTU size minus packet header size. For example, when using a 9000 Byte MTU, set the packet size (payload) to 8972 for IPV4 (28 byte header), and 8952 for IPV6 (48 byte header).
  • The socket buffer size: For UDP, buffer size is not related to RTT the way TCP is, but the defaults are still not large enough. Setting the socket buffer to 4M seems to help a lot in most cases.
  • core selection: UDP at 10G is typically CPU limited, so it's important to pick the right core. This is particularly true on Sandy/Ivy Bridge motherboards.

10.1. View current socket buffer sizes:

    [root@test1 ~]# sysctl net.core.rmem_max
    [root@test1 ~]# sysctl net.core.wmem_max
    [root@test1 ~]# sysctl net.core.rmem_default
    [root@test1 ~]# sysctl net.core.wmem_default

10.2. Typical socket buffer sizes:

    [root@test1 ~]# vi /etc/sysctl.conf
    net.core.rmem_max = 536870912 #512MB
    net.core.wmem_max = 536870912 #512MB
    net.core.rmem_default = 536870912 #512MB
    net.core.wmem_default = 536870912 #512MB
    net.core.netdev_max_backlog = 40000

    net.ipv4.tcp_rmem = 4096 33554432 536870912 # 4KB, 32MB, 512MB
    net.ipv4.tcp_wmem = 4096 33554432 536870912 # 4KB, 32MB, 512MB
    net.ipv4.tcp_mem = 33554432 33554432 33554432 # 32MB, 32MB, 32MB
    net.ipv4.tcp_mtu_probing = 1
    net.ipv4.tcp_slow_start_after_idle = 0

    net.ipv4.route.flush = 1

    net.ipv4.conf.all.arp_filter = 1
    net.ipv4.conf.all.rp_filter = 0
    net.ipv4.neigh.default.gc_thresh1 = 8192
    net.ipv4.neigh.default.gc_thresh2 = 8192
    net.ipv4.neigh.default.gc_thresh3 = 8192
    net.ipv4.neigh.default.gc_stale_time = 3600

    #Force IGMP version 2
    net.ipv4.conf.all.force_igmp_version = 2
    net.ipv4.conf.default.force_igmp_version = 2

    #figure these ones out:
    net.ipv4.udp_mem = 12364032 16485376 24728064 ???
    net.ipv4.udp_rmem_min = 4096 ???
    net.ipv4.udp_wmem_min = 4096 ???

10.3 Make sure the interface port type is set up as eth (as opposed to ib):

```
[root@hpc_node02 users]# /sbin/connectx_port_config -s
--------------------------------
Port configuration for PCI device: 0000:03:00.0 is:
eth
--------------------------------
```

10.4 Run the interactive command below to set the port as eth: [root@hpc_node02 users]# /sbin/connectx_port_config

10.5. Check the resulting .conf file and copy its content to /opt/chroots/hpc_os_images//rhel-xes_mell/etc/infiniband/connectx.conf:

    [root@hn_server users]# vi /opt/chroots/hpc_os_images/rhel-xes_mell/etc/infiniband/connectx.conf
    #ConnectX Port Configuration for 0000:83:00.0
    /sbin/connectx_port_config -d 0000:83:00.0 -c eth
    #ConnectX Port Configuration for 0000:03:00.0
    /sbin/connectx_port_config -d 0000:03:00.0 -c eth
    #ConnectX Port Configuration for 0000:85:00.0
    /sbin/connectx_port_config -d 0000:85:00.0 -c eth

10.6. Determine the PCI bus the card is on:

    dmesg | grep mlx
    ethtool -i eth3

11. Determine the NUMA node the PCI bus belongs to (if the value returned is -1, you have a UMA hardware platform and the kernel is emulating/faking NUMA, or the device is on a bus that does not have NUMA locality, such as a PCI bridge):

    view /sys/bus/pci/devices/0000:83:00/numa_node
    cat /sys/class/net/eth3/device/numa_node
    cat /sys/class/net/eth4/device/numa_node

12. Determine the CPU values for the next step (#1,#2,#3,#4):

    numactl -H | grep "node 1 cpus"
    numactl --show
    numactl -H

13. Collecting Network Statistics Data for both Blades (1 blade is shown):

13.1. Collect the Statistics on boot before running qperf:

    [root@hpc_node02 users]# egrep "CPU|eth3" /proc/interrupts > ~/report_onboot.txt
    [root@hpc_node02 users]# cat /proc/net/softnet_stat >> ~/report_onboot.txt
    [root@hpc_node02 users]# ethtool --statistics eth3 >> ~/report_onboot.txt
    [root@hpc_node02 users]# netstat --interfaces=eth3 >> ~/report_onboot.txt
    [root@hpc_node02 users]# netstat --statistics >> ~/report_onboot.txt
    [root@hpc_node02 users]# netstat --route >> ~/report_onboot.txt

13.2. Collect the Statistics with irqbalance running:

    [root@hpc_node02 users]# watch -n 1 "egrep \"CPU|eth3\" /proc/interrupts"
    [root@hpc_node02 users]# watch -n 0.1 'cat /proc/net/softnet_stat'
    [root@hpc_node02 users]# watch -n 0.1 "ethtool --statistics eth3 | grep -E \"tx_packets|tx_bytes|rx_packets|rx_bytes|errors|dropped|collisions|aggregated|flushed|stopped|timeout|fail|filtered|clean\""
    [root@hpc_node02 users]# watch -n 0.1 "netstat --interfaces=eth3; netstat --statistics"
    [root@hpc_node02 users]# netstat --route

14. Bind the CPUs and run qperf:

    Rx:
    numactl --physcpubind=10 --membind=0 qperf
    numactl --physcpubind=24 --membind=1 qperf
    Tx:
    numactl --physcpubind=10,11,12,13 --membind=0 qperf -v -t 3 -ub -m 8972 blade01-eth3 udp_bw
    numactl --physcpubind=24,25,26,27 --membind=1 qperf -v -t 3 -ub -m 8972 blade01-eth4 udp_bw

15. Compare the two Statistics reports:
[root@hpc_node02 users]# sdiff ~/report_onboot.txt ~/report_irqbalance_running.txt

  1. Note:

    ifconfig eth3 | head --lines=2
    ifconfig eth4 | head --lines=2
    cat /sys/class/net/eth3/device/numa_node
    cat /sys/class/net/eth4/device/numa_node
    numactl -H

Network Performance Debugging

| Section 1            | Section 2      | Section 3                       |
| Ring Buffer (NIC HW) | QDISC (Kernel) | Application Buffer (User Space) |

  • Section 1: Ring Buffer (NIC HW)
    • echo ‘MTU=9000’ >> /etc/sysconfig/network-scripts/ifcfg-ethN
    • ethtool -g|G eth4 tx|rx-jumbo|rx-mini|rx N Show/Set Ring Buffer depth.
    • ethtool -l|L eth4 tx|rx N Show/Set Ring Buffer Channels, where a channel is a set of queues with a single IRQ.
    • ethtool -a|A eth4 rx|tx on Configure Flow Control on the NIC to send PAUSE frames to the switch.
    • ethtool -c|C eth4 adaptive-rx off rx-usecs 0 rx-frames 0 Configures Interrupt Coalescence: the amount of traffic that the NIC must receive, or time passed after receiving traffic, before issuing a Hard Interrupt.
    • ethtool -k|K eth4 NIC Offloads should be used on high speed systems that transmit or receive large amounts of data and favor throughput over latency.
    • watch -n 0.1 "ethtool --statistics eth4 | grep -E \"flushed|aggregated|errors|collisions|timeout|offload|desc|filtered|jabbers\"" Monitor the NIC counters.

  • Section 2: QDISC (KERNEL)
    • /proc/sys/net/core/default_qdisc Qdisc=pfifo_fast: deep queue and not flow aware
    • /proc/sys/net/core/dev_weight The max number of frames the NIC can receive before the SoftIRQ triggers the CPU to empty the Qdisc (decrease to speed up the Qdisc emptying by the CPU).
    • /proc/sys/net/core/netdev_budget The duration of the SoftIRQ in CPU time (for high speed NICs, increase so the CPU has more time to empty the Qdisc). Increase the budget if any of the columns besides the 1st one are increasing in /proc/net/softnet_stat: watch -d -n 0.5 'cat /proc/net/softnet_stat'
    • /proc/sys/net/core/netdev_max_backlog Sets the maximum number of packets allowed to queue when a particular interface (NIC) receives packets faster than the kernel can process them.
    • /proc/sys/net/core/optmem_max Configures the maximum ancillary buffer size allowed per socket.
    • Socket Receive Buffer Size in bytes. Crank up these values to increase the queue depth to avoid Buffer overruns: /proc/sys/net/core/rmem_default /proc/sys/net/core/rmem_max /proc/sys/net/core/wmem_default /proc/sys/net/core/wmem_max
    • watch -d -n 0.5 'cat /proc/net/snmp' An increase in UDP input errors indicates that one or more socket receive queues are full when the network stack attempts to queue new frames into an application's socket.
    • Make sure interrupts are well balanced across CPU cores: watch -d -n 0.5 'cat /proc/interrupts'; watch -d -n 0.5 'netstat -su' #statisticsUdp
    • watch -d -n 0.3 'netstat -neopau' #numericExtendedTimersProgramAllUdp
      • Recv-Q: the count of bytes yet to be copied by the user's application from the buffer.
      • Send-Q: the count of bytes yet to be acknowledged by the remote host.
    • dropwatch -l kas (then type "start" at the prompt) Monitors the packets that are dropped by the kernel.
  • Section 3: Application Buffer (User Space)
    • To avoid Qdisc overruns: in your program, increase the rate at which the recv() or read() calls are performed to empty the Qdisc more rapidly.
    • CPU Performance: x86_energy_perf_policy -r; x86_energy_perf_policy performance
    • numactl --physcpubind=10,11,12,13 --membind=0 <your application>
    • turbostat --interval 1 --debug CPU Monitoring.
    • NUMA node statistics: watch -d -n 0.5 numastat -n (numa_miss and numa_foreign are BAD if incrementing); watch -d -n 0.5 numastat -m

Red Hat Enterprise Linux Network Performance Tuning Guide

Source: "Red Hat Enterprise Linux Network Performance Tuning Guide.pdf"

ethtool (Networking Tools):

  1. Displays NIC settings:

    #Driver Information:
    ethtool -i ethX
    #Number of Queues dynamically allocated:
    ethtool -l ethX
    #Current Ring Buffer parameters for ethN: 
    ethtool -g ethN
    #statistics:
    ethtool [-S|--statistics] ethX # e.g. ethtool -S ethX | grep -E "err|dropped|flushed|aggregated|filtered"
      tx_packets: transmitted packets
      rx_packets: received packets
      tx_errors: transmission errors
      rx_errors: receive errors
      rx_missed: received misses
      align_errors: received alignment errors
      tx_single_collisions: transmitted singular collisions
      tx_multi_collisions: transmitted multiple collisions
      unicast: received unicast
      broadcast: received broadcast
      multicast: received multicast
      tx_aborted: aborted transmissions
      tx_underrun: transmit underruns
    #make it blink for N seconds:
    ethtool [-p|--identify] ethX [N] 
    #test hw online (nvram, link test), or offline (register, memory loopback, interrupt):
    ethtool [-t|--test] ethX [offline|online|external_lb] 
    
    
  2. Set NIC settings:

    #set parameter:
    ethtool [-s|--change] ethX param=PARAM
    ethtool [-s] [--offload|speed|duplex|autoneg|phyad]
    ethtool [-s] port
    ethtool [-s] [--xcvr] [internal|external] 
    #set Ring Buffer parameters for ethN
    ethtool -G|--set-ring ethN [rx N] [rx-mini N] [rx-jumbo N] [tx N]
    
    

ip (Networking Tools):

  1. Manages and monitors Routes, Devices, Policy Routing, and Tunnels.
  2. Uses the kernel's Netlink interface and supersedes ifconfig, which uses the old-style IOCTL interface.
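
For reference, a few common ip invocations that cover the same ground as the legacy tools (eth3 as used elsewhere on this page; substitute your interface):

```
ip addr show eth3      # interface addresses (ifconfig replacement)
ip -s link show eth3   # per-interface packet/byte/drop counters
ip route show          # routing table (route -n / netstat -r replacement)
```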

netstat (Networking Tools):

  1. Provides information about Open Network Connections (like ifconfig) and Protocol Stack Statistics:
  2. Retrieves info from these files: /proc/net/dev /proc/net/tcp /proc/net/unix

netstat [-neopa | --numeric --extend --timer --programs --all]
netstat [-i | --interfaces] | [--interfaces=eth0]
netstat [-s | --statistics]
netstat [-r | --route]
netstat [-M | --masquerade]

dropwatch (Networking Tools):

  1. Monitors packets freed from memory by kernel

/proc/net/snmp (Networking Tools):

  1. ASCII file containing data for IP, ICMP, TCP and UDP management information for snmp agent.
  2. Displays real-time UDP/UDPLite statistics.

sysctl (Networking Tools):

  1. Used to read/write run-time configuration parameters from /proc/sys/ using the following syntax:

    sysctl net.ipv4.tcp_sack                  #reading from /proc/sys/net/ipv4/tcp_sack
    sysctl -w net.ipv4.tcp_sack=0             #writing to /proc/sys/net/ipv4/tcp_sack
    echo "0" > /proc/sys/net/ipv4/tcp_sack    #alternative to sysctl -w
  2. Place your settings into the /etc/sysctl.conf file to make them persistent: echo "net.ipv4.tcp_sack = 0" >> /etc/sysctl.conf
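
Settings placed in /etc/sysctl.conf are applied at boot; to apply them immediately without rebooting, reload the file (standard sysctl usage):

```
sysctl -p    # re-read /etc/sysctl.conf and apply the settings now
```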

Identifying the Bottleneck (aka Overflow; Gathering Statistics Information)

  1. Packet drops and overruns typically occur when the RX buffer on the NIC card cannot be drained fast enough by the kernel.
  2. If the kernel drains the packets at a slower rate than the network provides them, the NIC discards incoming packets and increments a discard counter.
  3. A hardware interrupt signals that traffic has arrived; the SoftIRQ raised in response then polls the NIC for traffic.
  4. The net.core.netdev_budget value determines the IRQ poll duration.
  5. Using ethtool, look for counter names like fail, miss, error, discard, buf, fifo, full, or drop (consult the hardware documentation):

    ethtool -S eth4 #Check counter statistics at the adapter firmware level
      rx_errors: 0
      tx_errors: 0
      rx_dropped: 0
      tx_dropped: 0
      rx_length_errors: 0
      rx_over_errors: 3295
      rx_crc_errors: 0
      rx_frame_errors: 0
      rx_fifo_errors: 3295
      rx_missed_errors: 3295
    egrep "CPU0|eth3" /proc/interrupts # make sure interrupts are spread across CPUs; see man irqbalance
             CPU0     CPU1
      105:   1430000  0     IR-PCI-MSI-edge  eth2-rx-0
      106:   1200000  0     IR-PCI-MSI-edge  eth2-rx-1
      107:   1399999  0     IR-PCI-MSI-edge  eth2-rx-2
      108:   1350000  0     IR-PCI-MSI-edge  eth2-rx-3
      109:   80000    0     IR-PCI-MSI-edge  eth2-tx
    cat /proc/net/softnet_stat # all columns except the 1st should have 0s; double the net.core.netdev_budget value if not
    netstat -s #for protocol layers IP, TCP, or UDP
  6. If the data is not drained fast enough from the socket buffer queue, the RX Queue will fill up. Use the ss -nmp command to see if the RX Queues are overflowing (mem:(r0,w0,f4096,t0)):

    ss -nmp
    State   Recv-Q Send-Q  Local Address:Port   Peer Address:Port
    ESTAB   0      0       192.168.10.81:22     192.168.10.14:50483
      mem:(r0,w0,f0,t0)
    ESTAB   0      0       192.168.10.81:22     192.168.10.14:50489
      mem:(r0,w0,f4096,t0)
    ESTAB   0      0       192.168.10.81:845    192.168.10.14:2049

Brainstorming Possible Issues

  1. SoftIRQs may not be getting enough CPU time, use sar, mpstat, or top to determine what consumes CPU runtime.
  2. The data may not be drained fast enough from the Rx socket buffer; use: ss -nmp (to look for full Rx Queues); netstat -s (to look for Buffer Pruning Errors and UDP Errors, -su option); ethtool -G eth4 rx 8192 tx 4096 (to increase the size of the Rx and Tx buffers).
  3. The Speed (lspci | grep Speed) and the Width (lspci | grep Width) of the PCI Bus may not be large enough for the data to be transmitted from NIC to CPU; use the Mellanox Procedure above to determine if that's the case.
  4. If the number of Data Sources is low, increasing the packet size will reduce header/trailer overhead; thus improving the network performance.

Interrupt Coalescence (IC):

Interrupt coalescence refers to the amount of traffic that a network interface will receive, or the time that passes after receiving traffic, before issuing a hard interrupt:

    ethtool -c eth4                                          #check the NIC's interrupt coalescence
    ethtool -C eth4 adaptive-rx off rx-usecs 0 rx-frames 0   #tune the NIC's interrupt coalescence

  5. Increase the number of TCP Streams. Using more streams is more efficient at transferring data: netstat -neopa #shows the number of connections an application is using.

Adapter Queue

  1. Check the statistics on each CPU core's SoftIRQ:

    cat /proc/net/softnet_stat
    #The output of the command above has one row of data per CPU:
    #1st column: number of frames received by the interrupt handler
    #2nd column: frames dropped; see net.core.netdev_max_backlog
    #3rd column: times ksoftirqd ran out of netdev_budget; see net.core.netdev_budget

wc -l /proc/net/softnet_stat #count the number of CPU cores

  2. If the third column has non-zero values, double the current IRQ budget value:

    sysctl net.core.netdev_budget
      net.core.netdev_budget = 300
    sysctl -w net.core.netdev_budget=600

  3. If the second column has a non-zero value while the corresponding third column value is zero, adjust the buffer depth and increase net.core.netdev_max_backlog:

    sysctl net.core.netdev_max_backlog
      net.core.netdev_max_backlog = 1000
    sysctl -w net.core.netdev_max_backlog=1500
    OR echo VALUE > /proc/sys/net/core/netdev_max_backlog

    ethtool -g
    Ring parameters for eth3:
    Pre-set maximums:
    RX:        8192
    RX Mini:   0
    RX Jumbo:  0
    TX:        8192
    Current hardware settings:
    RX:        1024
    RX Mini:   0
    RX Jumbo:  0
    TX:        512
    ethtool -G eth4 rx 2048 tx 1024
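
Returning to the /proc/net/softnet_stat check in step 1: the counters are printed in hexadecimal with one row per CPU, so a small sketch (assumes gawk is installed for strtonum) can make the dropped and time_squeeze columns easier to read:

```
gawk '{ printf "CPU%-3d frames=%d dropped=%d squeezed=%d\n", NR-1, strtonum("0x"$1), strtonum("0x"$2), strtonum("0x"$3) }' /proc/net/softnet_stat
```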

Adapter RX/TX Buffer Tuning

Often increasing the Receive Buffer size helps to prevent packet drops:

    ethtool -g eth4
    Ring parameters for eth4:
    Pre-set maximums:
    RX:        8192
    RX Mini:   0
    RX Jumbo:  0
    TX:        8192
    Current hardware settings:
    RX:        1024
    RX Mini:   0
    RX Jumbo:  0
    TX:        512
    ethtool -G eth4 rx 8192 tx 4096   #Use /sbin/ifup-local to make commands persistent.

Adaptive Transmit Queue Length

    ip link                                 #to see the qlen value
    ip -s link                              #to see if there are any drops on the TX Queue for an adapter
    ip link set dev eth4 txqueuelen VALUE   #to set the qlen value
    #a udev rule can be written to apply the qlen to the interface as it's created (see the example below),
    #or the network scripts can be extended with a script in /sbin/ifup-local
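
One possible udev rule for the persistent-qlen approach mentioned above (file name, interface, and value are illustrative; tx_queue_len is the sysfs attribute behind txqueuelen):

```
# /etc/udev/rules.d/71-txqueuelen.rules (hypothetical)
ACTION=="add", SUBSYSTEM=="net", KERNEL=="eth4", ATTR{tx_queue_len}="10000"
```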

Module Parameters

    dmesg | grep 'Ethernet driver'
    modinfo mlx4_en
    ls -la /sys/module/mlx4_en/parameters/                    # to see available parameters
    cat /sys/module/mlx4_en/parameters/PARAM_NAME             # to see the current value
    echo VALUE > /sys/module/mlx4_en/parameters/PARAM_NAME    # to set a new value
    modprobe -r mlx4_en; modprobe mlx4_en                     # to reload the driver once a value was changed (may need reboot)
    modprobe -r mlx4_en; modprobe mlx4_en PARAM_NAME=VALUE    # to load the driver with a parameter for non-persistent use

#Note Driver parameters can be set via ethtool when module option is not available: ethtool -C eth4 rx-usecs 1000 #already discussed above
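
To make a module parameter survive reboots, the usual approach is an options line in /etc/modprobe.d/ (file name is arbitrary; PARAM_NAME/VALUE are placeholders, as in the commands above):

```
echo "options mlx4_en PARAM_NAME=VALUE" > /etc/modprobe.d/mlx4_en.conf
```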

Adapter Offloading

  1. To reduce CPU load from the system, modern network adapters have offloading features which move some network processing load onto the network interface card.
  2. The kernel can submit large (up to 64k) TCP segments to the NIC, which the NIC will then break down into MTU-sized segments.
  3. Offloading features are often enabled by default.
  4. The offloading parameter tuning is performed by turning a feature off and observing whether network performance improves. If the network performance improved after switching a feature off, report the finding to Red Hat support.
  5. Common Offloading Features:
    GRO: Generic Receive Offload
    LRO: Large Receive Offload
    TSO: TCP Segmentation Offload
    RX check-summing: Processing of receive data integrity
    TX check-summing: Processing of transmit data integrity (required for TSO)

UDP Buffer Tuning

  1. Less complex protocol than TCP.
  2. contains no session reliability.
  3. The application is responsible for identifying and re-transmitting dropped packets.
  4. Does not use window size concept.
  5. Lost data is not recovered by the UDP protocol.
  6. Increasing the Receive Buffer size is the only tuning available.
  7. Use netstat -us to list the errors.
  8. If netstat -us indicates packet receive errors, increase the Receiver Buffer size: 8.1 The increase in packet receive errors statistic also may be due to:
  • short packet size (the data payload is less than the UDP header advertises)
  • corrupted packets that fail their checksum calculation
  8.2. Check the current buffer size: sysctl net.core.rmem_max (e.g. 124928)
  8.3. Set the buffer size: sysctl -w net.core.rmem_max=16777216
  8.4. Restart the application for the new settings to take effect.

NUMA Architecture

  1. Splits a subset of CPU, memory and devices into different nodes.
  2. Each node is similar to a small computer with a fast interconnect, but all nodes share a common operating system.
  3. Tuning a NUMA system differs from tuning a non-NUMA system.
  4. On a NUMA system, for all devices belonging to a single NUMA node, the devices' interrupts must be bound to the CPU cores of that NUMA node (in other words, you don't want the hardware on node1 to send interrupts to CPUs on node0 or node2; interrupts must be handled by the CPUs that belong to the same node as the hardware that generates them!).
  5. irqbalance is NUMA-aware allowing interrupts to balance only the CPUs within a given NUMA node.

Determine NUMA nodes

  1. Determine the number of NUMA nodes a system has:

    ls -ld /sys/devices/system/node/node*
    drwxr-xr-x. 3 root root 0 Aug 15 19:44 /sys/devices/system/node/node0
    drwxr-xr-x. 3 root root 0 Aug 15 19:44 /sys/devices/system/node/node1

  2. Determine NUMA locality (which CPU belongs to what NUMA node):

    cat /sys/devices/system/node/node0/cpulist  #has 6 CPUs since it returned 0-5
    0-5
    cat /sys/devices/system/node/node1/cpulist  #has no CPUs since nothing is returned

  3. Determine the Device Locality (which NUMA node the device belongs to):

    cat /sys/class/net/<ethN>/device/numa_node
    cat /sys/class/net/eth0/device/numa_node
    1

    3.1 If the command above returns -1, the hardware platform is actually uniform and the kernel is emulating NUMA, or the device is on a bus which does not have any NUMA locality, such as a PCI bridge.
  4. To check the number of RX and TX queues on the adapter: egrep "CPU0|eth3" /proc/interrupts
    4.1 The number of queues can sometimes be dynamically allocated using the ethtool -L command (normally queues are allocated when the NIC driver module is loaded).
    4.2 The network card statistics can be listed using: ethtool -S eth0
  5. To manually balance CPU affinity:
    5.1 Stop the irqbalance service from automatically balancing the CPU affinity: service irqbalance stop
    5.2 Disable the irqbalance service: chkconfig irqbalance off
    5.3 Use the vendor's scripts/procedure to manually balance the IRQs

ethtool -k eth4
Features for eth4:
rx-checksumming: on
tx-checksumming: on
scatter-gather: on
tcp-segmentation-offload: on
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: on
large-receive-offload: on
rx-vlan-offload: on
tx-vlan-offload: on
ntuple-filters: off
receive-hashing: on
ethtool -K eth4

How to Clear/Reset ethtool Statistics

[root@hpc_node02 ~]# ethtool -i eth3
driver: mlx4_en
version: 3.4-1.0.0 (25 Sep 2016)
firmware-version: 2.40.5000
bus-info: 0000:03:00.0
supports-statistics: yes
supports-test: yes
supports-eeprom-access: no
supports-register-dump: no
supports-priv-flags: yes

[root@hpc_node02 ~]# /etc/init.d/network restart
[root@hpc_node02 ~]# modprobe -r mlx4_en
[root@hpc_node02 ~]# modprobe mlx4_en
[root@hpc_node02 ~]# /etc/init.d/network start

Queueing in the Linux Network Stack

Source: https://www.coverfire.com/articles/queueing-in-the-linux-network-stack/

Packet Queues: \

  1. allow for asynchronous modules to communicate.
  2. increase performance.
  3. have the side effect of impacting latency.

Driver Queue (aka Ring Buffer): \

  1. Driver Queue is a FIFO ring-buffer (think of it as a fixed size buffer) which treats all packets equally and has no capabilities for distinguishing between packets of different flows.
  2. Does not contain packet data. Instead, consists of descriptors which point to other data structures called Socket Kernel Buffers (SKB; SKBs hold the packet data and are used throughout the Kernel).
  3. The Input Source to the Driver Queue is the IP Stack.
  4. Ensure that whenever the system has data to transmit, the data is available to the NIC for immediate transmission.
  5. Gives the IP stack a location to queue data asynchronously from the operation of the hardware.
  6. Note: One alternative design would be for the NIC to ask the IP stack for data whenever the physical medium is ready to transmit. Since responding to this request cannot be instantaneous, this design wastes valuable transmission opportunities resulting in lower throughput. The opposite approach would be for the IP stack to wait after a packet is created until the hardware is ready to transmit. This is also not ideal because the IP stack cannot move on to other work.

IP Stack: \

  1. Queues complete IP packets.
  2. The packets may be generated locally.
  3. The packets may be received on one NIC when the device is functioning as an IP router.

Hardware Driver: \

  1. Dequeues the packets from Driver Queue and sends them across the data bus to the NIC Hardware for the transmission.

MTU (Maximum Transmission Unit): \

  1. Indicates the biggest frame which can be transmitted by the physical media.
  2. Jumbo frames are supported up to 9,000 bytes.
  3. Determines the number of packets created by the IP Stack: if a 2,000 byte payload has to be transmitted while the MTU is set to 1,500, two packets will be created and sent (as a result, latency is introduced due to the extra packet header and footer).
  4. When the MTU is too small, each packet inherits an overhead since each packet has to carry a packet header and a packet footer in addition to the packet data.

Linux Kernel MTU Optimization: \

  1. Linux has transmit side optimizations which reduce per-packet overhead.
  2. TSO: TCP Segmentation Offload.
  3. UFO: UDP Fragmentation Offload.
  4. GSO: Generic Segmentation Offload.
  5. All of the optimizations above allow the IP Stack to create packets that are larger than the MTU of the outgoing NIC.
  6. For IPV4, a packet of maximum size of 65,536 bytes can be created and queued to the Driver Queue.
  7. For TSO and UFO, the NIC hardware breaks the single large packet into packets small enough to be transmitted on the physical hardware.
  8. For the NICs without hardware support , GSO breaks the single large packet into smaller packets in software immediately before queueing the Driver Queue.
  9. Since TSO, UFO and GSO allow for much larger packets, these optimizations have the side effect of greatly increasing the number of bytes that can be queued in the driver queue (introducing Bufferbloat).

Receive Path: \

  1. Linux also has receive side optimizations which reduce per-packet overhead.
  2. GRO: Generic Receive Offload; allows the NIC driver to combine received packets into a single large packet which is then passed to the IP Stack.
  3. When forwarding packets, GRO allows for the original packets to be reconstructed which is necessary to maintain the end-to-end nature of IP packets.
  4. When the large packet is broken up on the transmit side of the forwarding operation, it results in several packets for the flow being queued at once. This micro-burst of packets can negatively impact inter-flow latency.

Starvation (too small a buffer) and Latency (too large a buffer): \ The queue between the IP Stack and the Hardware introduces two problems: \

  1. Starvation: \ When the NIC driver wakes to pull packets off of the queue for transmission and the queue looks empty due to asynchronicity between the IP Stack and the NIC's Hardware Driver, the hardware will miss a transmission opportunity thereby reducing the throughput of the system. Note that an empty queue when the SYSTEM does not have anything to transmit is not starvation – this is normal. On a busy system the IP stack will get fewer opportunities to add packets to the buffer which increases the chances that the hardware will drain the buffer before more packets are queued (resulting in Starvation). It is advantageous to have a very large buffer to reduce the probability of starvation and ensures high throughput (but may introduce Latency).

  2. Latency: \ Latency often occurs when the driver queue is filled with data of different sizes coming from two different sources. If the head of the queue is filled with large segments from the first source and the tail of the queue holds a small segment from the second source, the queue has to be emptied of the larger segments of the first source first, introducing a large delay to the transmission of the small segment of the second source. The smaller the amount of queued data, the lower the maximum latency experienced by queued packets. Ex: a 5 Mbit/sec NIC transmits a bulk flow with 1,500 byte packets and an interactive packet of 500 bytes. The queue depth is 128 descriptors, where 127 descriptors hold bulk packets and 1 holds the interactive packet. Time required to drain the 127 bulk packets and create an opportunity for the interactive packet to be transmitted: 127*12,000/5,000,000 = 304 ms (which is our resultant latency). Thus a bufferbloat of 304 ms is introduced into the system by the oversized, unmanaged buffer.

BQL: Byte Queue Limits reduces network Latency by limiting the amount of queued data to the minimum required to avoid starvation: \

  1. attempts to solve the problem of Driver Queue sizing automatically.
  2. Adds a layer which enables and disables queuing to the Driver Queue by calculating the minimum buffer size required to avoid starvation under the current system conditions.
  3. The actual size of the driver queue is not changed by BQL. Instead, BQL calculates a limit of how much data (in bytes) can be queued at the current time. Any bytes over this limit must be held or dropped by the layers above the driver queue.
  4. The BQL mechanism calculates the LIMIT value under the next 2 conditions:
    4.1. packets are queued to the Driver Queue.
    4.2. a transmission to the wire has completed.
  5. LIMIT is the calculated value that BQL uses as a Threshold Value for the Driver Queue.
  6. BQL mechanism, 1st stage: after adding packets to the queue, if the number of queued bytes is over the current LIMIT value, disable the queuing of more data to the driver queue.
  7. The amount of queued data can exceed LIMIT because data is queued before the LIMIT check occurs.
  8. Enabling TSO, UFO or GSO can cause too large a number of bytes to be queued in a single operation, allowing a higher than desirable amount of data to be queued (exceeding the LIMIT value).
  9. TSO, UFO and GSO can be disabled to improve the latency issue.
  10. BQL operates on a bytes basis (not packets or descriptors).
  11. BQL mechanism, 2nd stage (when the hardware has completed sending a batch of packets, referred to as the end of an interval): if the hardware was starved in the interval, increase the LIMIT value; else if the hardware was busy during the entire interval (not starved) and there are bytes to transmit, decrease LIMIT by the number of bytes not transmitted in the interval; if the number of queued bytes is less than the LIMIT, enable the queuing of more data to the buffer. Ex: driver queue size = 256 descriptors, MTU = 1500 bytes, meaning 256*1500 = 384,000 bytes could be queued; however, BQL may calculate a limit of, say, 3,012 bytes (thus reducing 384,000 bytes down to 3,012 bytes).

QDisc: Queueing Disciplines: \

  1. Sandwiched between the IP Stack and the Driver Queue is the Queuing Discipline (QDisc) layer.
  2. Implements the traffic management capabilities of the Linux kernel which include: Traffic Classification, Prioritization, and Rate Shaping.
  3. There are 3 key concepts to understand in the QDisc layer: QDiscs, Classes, and Filters.
  3.1. QDisc: the Linux abstraction for traffic queues which are more complex than the standard FIFO queue. Allows complex queue management behaviors to be carried out without requiring the IP Stack or the NIC Driver to be modified. By default every network interface is assigned a pfifo_fast QDisc which implements a simple three-band prioritization scheme based on the TOS bits (see IP Stack fields). The pfifo_fast QDisc is far from the best choice because it defaults to having very deep queues (see txqueuelen below) and is not flow aware.
  3.2. Class: Only classful QDiscs have support for multiple classes. Closely related to the QDisc. QDiscs may implement classes in order to handle subsets of the traffic differently. Ex: the Hierarchical Token Bucket (HTB) QDisc allows the user to configure 500Kbps and 300Kbps classes and direct traffic to each as desired (see the tc sketch after this list).
  3.3. Filters: the mechanism used to classify traffic to a particular QDisc or class. u32 is the most general flow filter and is the easiest to use.
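
A sketch of the HTB example above expressed as tc commands (interface, classids, and the port-based filter are illustrative, not part of the original example):

```
tc qdisc add dev eth0 root handle 1: htb default 20            # root HTB qdisc; unclassified traffic goes to 1:20
tc class add dev eth0 parent 1: classid 1:10 htb rate 500kbit  # 500Kbps class
tc class add dev eth0 parent 1: classid 1:20 htb rate 300kbit  # 300Kbps class
tc filter add dev eth0 parent 1: protocol ip u32 match ip dport 22 0xffff flowid 1:10  # u32 filter: SSH traffic into 1:10
```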

TCP Small Queue: \

  1. The Latency problem due to the two sources transmitting a high packet rate flow and a small packet as previously described can occur in Queuing Discipline, and a standing queue can form (Increased Latency, problems in TCP's RTT, and congestion window size calculations).
  2. The TCP Small Queues address the problem above!
  3. Another way to solve the problem described is to use QDisc with many queues (ideally 1 per network flow). The Stochastic Fairness Queueing (SFQ) QDisc, and Fair Queueing with Controlled Delay (fq_codel) QDisc have a queue per network flow!

MANIPULATING QUEUE SIZES IN LINUX

Driver Queue: \

  1. ethtool command:

    controls the driver queue size for eth devices.
    provides low level interface statistics.
    provides the ability to enable/disable **IP Stack** features.
    provides the ability to enable/disable **Driver** features.
    
  2. Use ethtool -g eth0 to display the driver queue (ring) parameters: \

    [root@host net-next]# ethtool -g eth0
    Ring parameters for eth0:
    Pre-set maximums:
    RX:        16384
    RX Mini:    0
    RX Jumbo:    0
    TX:        16384
    Current hardware settings:
    RX:        512
    RX Mini:    0
    RX Jumbo:    0
    TX:        256
    

where TX: 256 corresponds to 256 descriptors in the transmission queue and RX: 512 corresponds to 512 descriptors in the receive queue. Note: it's often recommended to reduce the size of the Driver Queue in order to reduce latency. With the introduction of BQL (assuming your NIC driver supports it) there is no longer any reason to modify the driver queue size (see below for how to configure BQL).

  1. Use ethtool -k eth0 to display the current offload settings (TSO, UFO, GSO), and "ethtool -K eth0" to modify them:

    [user@host ~]$ ethtool -k eth0
    Offload parameters for eth0:
    rx-checksumming: off
    tx-checksumming: off
    scatter-gather: off
    tcp-segmentation-offload: off
    udp-fragmentation-offload: off
    generic-segmentation-offload: off
    generic-receive-offload: on
    large-receive-offload: off
    rx-vlan-offload: off
    tx-vlan-offload: off
    ntuple-filters: off
    receive-hashing: off
    

Note: to optimize for latency (at the cost of throughput), disable the TSO, GSO, UFO and GRO optimizations.
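
For example, the large-packet offloads discussed above can be switched off in one shot (eth0 is illustrative; re-test performance before making this permanent):

```
ethtool -K eth0 tso off ufo off gso off gro off
```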

BQL Byte Queue Limits: self tuning and does not require modifications: \

  1. BQL is self-tuning, but if you are concerned about optimal latencies at low bit rates you can override the upper limit on the calculated LIMIT value using the files found in /sys/devices/pci0000:00/0000:00:14.0/net/eth0/queues/tx-0/byte_queue_limits (the PCI path varies per NIC):

     hold_time: time between LIMIT recalculations, in milliseconds.
     inflight: the number of queued but not yet transmitted bytes.
     limit: the LIMIT value calculated by BQL; 0 if BQL is not supported by the NIC driver.
     limit_max: a configurable maximum value for LIMIT. Set this value lower to optimize for latency.
     limit_min: a configurable minimum value for LIMIT. Set this value higher to optimize for throughput.

    Note: set the value by doing: echo "VALUE" > file_name
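
For example, capping LIMIT at 32 KiB on queue tx-0 of eth0 (the value 32768 is an arbitrary illustration; /sys/class/net/eth0/... reaches the same per-queue directory through the stable symlink):

```
# Cap BQL's computed LIMIT to favor latency, then inspect the live values.
bql=/sys/class/net/eth0/queues/tx-0/byte_queue_limits
echo 32768 > ${bql}/limit_max
cat ${bql}/limit ${bql}/inflight
```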

Adjusting the Transmission Queue Length: the txqueuelen parameter refers to the higher level QDisc layer. \

  1. To address the problem of bufferbloat (large and small data sources sharing a queue), you can reduce the NIC transmission queue length statically (see the sketch after this list):

    [user@host ~]$ ifconfig eth0
    [user@host ~]$ ifconfig eth0 txqueuelen VALUE
    
    
  2. The txqueuelen is only used as a default queue length for some of the queuing disciplines. Specifically:

    pfifo_fast (Linux default queueing discipline)
    sch_fifo
    sch_gred
    sch_htb (only for the default queue)
    sch_plug
    sch_sfb
    sch_teql
    Note: if you do not use one of the above **queuing disciplines**, or if you override the **queue length** then the **txqueuelen** value is meaningless.
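
A sketch of shrinking the transmission queue length (500 packets is an arbitrary example), using either the classic ifconfig form from above or the equivalent ip link form:

```
# Either command sets the QDisc-level queue length to 500 packets.
ifconfig eth0 txqueuelen 500
ip link set dev eth0 txqueuelen 500
# The qlen field in the output reflects the new value.
ip link show dev eth0
```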
    

Queueing Disciplines: \

  1. See man tc for full details on configuring QDiscs.
  2. To find details for each QDisc, use man tc-<qdisc-name> (ex: man tc-htb or man tc-fq_codel)
  3. tc command tricks and tips:
  1. The HTB QDisc implements a default queue which receives all packets that are not classified by filter rules; check direct_packets_stat in the output of tc qdisc show to see how many packets took that path. The HTB class hierarchy is only useful for classification, not bandwidth allocation: all bandwidth allocation occurs by looking at the leaves and their associated priorities.
  2. DRR simply black-holes traffic that is not classified.
  3. The QDisc infrastructure identifies QDiscs and classes with major and minor numbers which are separated by a colon. The major number is the QDisc identifier. The minor number identifies the class within that QDisc.
  4. If you are using ADSL, which is ATM-based (most DSL services are ATM based, but newer variants such as VDSL2 are not always), you probably want to add the "linklayer adsl" option. This accounts for the overhead that comes from breaking IP packets into a bunch of 53-byte ATM cells.
  5. If you are using PPPoE then you probably want to account for the PPPoE overhead with the overhead parameter.
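
A minimal sketch of points 4 and 5, assuming an eth0 uplink shaped to 800kbit; the overhead value of 40 bytes is an assumption chosen purely for illustration:

```
# Hypothetical shaping example for an ADSL/PPPoE uplink.
tc qdisc add dev eth0 root handle 1: htb default 10
tc class add dev eth0 parent 1: classid 1:10 htb rate 800kbit \
    linklayer adsl overhead 40   # account for ATM cell framing and PPPoE headers
```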

TCP SMALL QUEUES

  1. The per-socket TCP queue limit can be viewed and controlled with the following /proc file: \

    [user@host ~]$ cat /proc/sys/net/ipv4/tcp_limit_output_bytes

    Note: you should not need to modify this value in any normal situation.

Useful Commands and Files

Some useful commands:

```
netstat --route #same as route -en
ip maddr
  maddr add
  maddr del
ip route
  route add
  route delete
  route replace
  route get
ip neigh
  neigh add
  neigh del
  neigh replace
arping -I <your eth dev> <neighbor's ip_addr>
      -D -I <your eth dev> <neighbor's ip_addr>
ss -a
  -e
  -o
  -n
  -p
route -en
      -eF
      -eC
      -nNveeF
      -nNveeC
      -del -net default
      -add -net 192.168.10.0/24 gw 192.168.10.14

ifconfig
ifup
ifdown
ethtool
ip
netstat -n, netstat -t, netstat -u, netstat -l, netstat -a
route -n
iptables
rpcinfo -p localhost
tc
arp -n, arp -d host, arp -i eth0
traceroute
dhclient eth0
sysctl
telnet www.address.com 80
lsof -i
tcpdump, tcpdump tcp, tcpdump udp, tcpdump port 80, tcpdump host hostname, tcpdump -i eth0
netcat
nmap ip_address
host www.host_domain_name.com

ethtool -G eth0 rx 4096 tx 2048
lspci -s 83:00.0 -vvv | grep Width # determine pci lanes (Mellanox supports x8 and x16)
lspci -s 83:00.0 -vvv | grep Speed # determine Speed in GT/s (Gen1=2.5GT/s, Gen2=5GT/s, and Gen3=8GT/s,
                                  # where GT/s stands for GigaTransfers (billions of transfers) per second)
lspci -s 83:00.0 -vvv | grep "PCIe Gen" # determine Generations
lspci -s 83:00.0 -vvv | grep DevCtl: -C 2 # used to view PCIe Max Payload Size which determines the maximal size of a PCIe packet
```

Some useful files:

```
/etc/hosts
/etc/resolv.conf
/etc/nsswitch.conf
/etc/services
```
TCP vs UDP Internet Protocol (IP) traffic

(Source: http://www.diffen.com/difference/TCP_vs_UDP)

| Feature | TCP | UDP |
| --- | --- | --- |
| Acronym for | Transmission Control Protocol | Universal/User Datagram Protocol |
| Connection | TCP is a connection-oriented protocol. | UDP is a connectionless protocol. |
| Function | As a message makes its way across the internet from one computer to another. This is connection based. | UDP is also a protocol used in message transport or transfer. This is not connection based, which means that one program can send a load of packets to another and that would be the end of the relationship. |
| Usage | TCP is suited for applications that require high reliability, and transmission time is relatively less critical. | UDP is suitable for applications that need fast, efficient transmission, such as games. UDP's stateless nature is also useful for servers that answer small queries from huge numbers of clients. |
| Used by these protocols | HTTP, HTTPS, FTP, SMTP, Telnet | DNS, DHCP, TFTP, SNMP, RIP, VOIP |
| Ordering of data packets | TCP rearranges data packets in the order specified. | UDP has no inherent order as all packets are independent of each other. If ordering is required, it has to be managed by the application layer. |
| Speed of transfer | The speed for TCP is slower than UDP. | UDP is faster because error recovery is not attempted. It is a "best effort" protocol. |
| Reliability | There is absolute guarantee that the data transferred remains intact and arrives in the same order in which it was sent. | There is no guarantee that the messages or packets sent will arrive at all. |
| Header size | TCP header size is 20 bytes. | UDP header size is 8 bytes. |
| Common header fields | Source port, Destination port, Checksum | Source port, Destination port, Checksum |
| Streaming of data | Data is read as a byte stream; no distinguishing indications are transmitted to signal message (segment) boundaries. | Packets are sent individually and are checked for integrity only if they arrive. Packets have definite boundaries which are honored upon receipt, meaning a read operation at the receiver socket will yield an entire message as it was originally sent. |
| Weight | TCP is heavy-weight. TCP requires three packets to set up a socket connection before any user data can be sent. TCP handles reliability and congestion control. | UDP is lightweight. There is no ordering of messages, no tracking of connections, etc. It is a small transport layer designed on top of IP. |
| Data flow control | TCP does flow control. TCP requires three packets to set up a socket connection before any user data can be sent. TCP handles reliability and congestion control. | UDP does not have an option for flow control. |
| Error checking | TCP does error checking and error recovery. Erroneous packets are retransmitted from the source to the destination. | UDP does error checking but simply discards erroneous packets. Error recovery is not attempted. |
| Fields | 1. Sequence number, 2. ACK number, 3. Data offset, 4. Reserved, 5. Control bits, 6. Window, 7. Urgent pointer, 8. Options, 9. Padding, 10. Checksum, 11. Source port, 12. Destination port | 1. Length, 2. Source port, 3. Destination port, 4. Checksum |
| Acknowledgement | Acknowledgement segments | No acknowledgement |
| Handshake | SYN, SYN-ACK, ACK | No handshake (connectionless protocol) |

High Level Network Theory
  • Type cat /etc/udev/rules.d/70-persistent-net.rules to determine the MAC Address of your hardware:

    # This file was automatically generated by the /lib/udev/write_net_rules
    # program, run by the persistent-net-generator.rules rules file.
    #
    # You can modify it, as long as you keep each rule on a single
    # line, and change only the value of the NAME= key.

    # PCI device 0x8086:0x1502 (e1000e)
    SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="6c:3b:e5:2d:31:cc", ATTR{type}=="1", KERNEL=="eth*", NAME="eth0"

    # PCI device 0x8086:0x1502 (e1000e)
    SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="10:60:4b:88:93:5a", ATTR{type}=="1", KERNEL=="eth*", NAME="eth1"

    # PCI device 0x8086:0x1502 (e1000e) (custom name provided by external tool)
    SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="10:60:4b:88:93:5a", ATTR{type}=="1", KERNEL=="eth*", NAME="eth0"

    # PCI device 0x1425:0x5411 (cxgb4)
    SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="00:07:43:2f:0a:60", ATTR{type}=="1", KERNEL=="eth*", NAME="eth2"

    # PCI device 0x1425:0x5411 (cxgb4)
    SUBSYSTEM=="net", ACTION=="add", DRIVERS=="?*", ATTR{address}=="00:07:43:2f:0a:68", ATTR{type}=="1", KERNEL=="eth*", NAME="eth3"

  • Type vi /etc/sysconfig/network-scripts/ifcfg-eth0 to change the following information:

    DEVICE=eth0
    TYPE=Ethernet
    UUID=d6f8eb0e-9bd4-4dac-8ac2-822bc724870e
    ONBOOT=yes
    NM_CONTROLLED=yes
    BOOTPROTO=none
    HWADDR=10:60:4B:88:93:5A
    IPADDR=129.218.8.46
    NETMASK=255.255.255.0
    GATEWAY=129.218.8.1
    #DNS1=129.204.125.250
    DEFROUTE=yes
    IPV4_FAILURE_FATAL=yes
    IPV6INIT=no
    NAME=eth0

  • Type vi /etc/sysconfig/network to change the HOSTNAME of your machine:

    HOSTNAME=mtn2ua30718qd
    NETWORKING=yes
    NISDOMAIN=gesd
    NTPSERVERARGS=iburst

  • Other useful commands:

    ifconfig        # shows currently running interfaces
    ifconfig -a     # shows all interfaces, running or not
    ifup eth0       # starts eth0
    ifdown eth0     # stops eth0 from running

ypcat hosts | grep serling // when setting up NIS using SYSTEM->ADMINISTRATION->AUTHENTICATION->NIS

NETWORK THEORY (To be completed)

computer.howstuffworks.com:

IP: Internet Protocol.

Static IP Address:

  • The one that you configure yourself by editing your computer's network settings.
  • Can create network issues if used without a good understanding of TCP/IP.
  • To ensure that the interface of a given machine keeps the IP address associated with it, the provided IP address is "attached" to the machine's Media Access Control address (MAC address).

Dynamic IP Address:

  • DHCP: Dynamic Host Configuration Protocol.
  • Assigned by DHCP, a service running on the network.
  • DHCP typically runs on network hardware such as routers or dedicated DHCP servers.
  • The IP address is only active for a limited time (called a lease), after which the computer automatically requests a new lease.
  • Dynamic IP acquisition process: when you add a computer to a network, that computer uses a four-step process to get an IP address from DHCP:
    • Discover -- The computer sends out a broadcast message on the network, hoping to discover a DHCP service provider.
    • Offer -- Each DHCP provider hears the message, recognizes the unique hardware address of the computer, and sends a message back offering its services to that computer.
    • Request -- The computer selects a DHCP provider from its offerings and then sends a request to that provider asking for an IP address assignment.
    • Acknowledge -- The targeted DHCP provider acknowledges the request and issues an IP address that doesn't match any other IP address currently active on the network.

IPv6: addresses consist of eight 16-bit fields, written in hexadecimal and separated by colons. Example: 2001:0cb8:85a3:0000:0000:8a2e:0370:7334

IPv4: consists of 4 8-bit fields, which means that each field ranges from 0 through 255 since 2^8 = 256. The Internet Assigned Numbers Authority (IANA) IP reservations:

  • 0.0.0.0 -- This represents the default network, which is the abstract concept of just being connected to a TCP/IP network.
  • 255.255.255.255 -- This address is reserved for network broadcasts, or messages that should go to all computers on the network.
  • 127.0.0.1 -- This is called the loopback address, meaning your computer's way of identifying itself, whether or not it has an assigned IP address.
  • 169.254.0.1 to 169.254.255.254 -- This is the Automatic Private IP Addressing (APIPA) range of addresses, assigned automatically when a computer fails to get an address from a DHCP server.
  • IANA has set aside specific ranges of IP addresses for use as non-routable, internal network addresses, which are considered to be unregistered IPs. Since no company/agency can claim ownership of these addresses, it's a good idea to use them as your inside local addresses without fear of IP conflict (routers discard messages destined to the unregistered public IP addresses). The range for each of the three IP address classes used for local networking:
    • Class A: 10.0.0.0 through 10.255.255.255
    • Class B: 172.16.0.0 through 172.31.255.255
    • Class C: 192.168.0.0 through 192.168.255.255

Subnets: a smaller network of computers connected to a larger network through a router:

  • Can have its own address system so computers on the same subnet can communicate quickly without sending data across the larger network.
  • A router on a TCP/IP network, including the Internet, is configured to recognize one or more subnets and route network traffic appropriately:
    • 10.0.0.0 to 10.255.255.255 -- This falls within the Class A address range of 1.0.0.0 to 127.0.0.0, in which the first bit is 0.
    • 172.16.0.0 to 172.31.255.255 -- This falls within the Class B address range of 128.0.0.0 to 191.255.0.0, in which the first two bits are 10.
    • 192.168.0.0 to 192.168.255.255 -- This falls within the Class C range of 192.0.0.0 through 223.255.255.0, in which the first three bits are 110.
    • Multicast (formerly called Class D) -- The first four bits in the address are 1110, with addresses ranging from 224.0.0.0 to 239.255.255.255.
    • Reserved for future/experimental use (formerly called Class E) -- addresses 240.0.0.0 to 254.255.255.254.

Class A, Class B, and Class C are most frequently used in creating subnets.

IANA is also responsible for assigning blocks of IP addresses to certain entities, usually commercial or government organizations:

  • Your Internet Service Provider (ISP) may be one of these entities, or it may be part of a larger block under the control of one of those entities.
  • Your ISP will assign you one of these addresses to connect you to the Internet.

Routers are used to share a single Internet connection (the IP address assigned to you by your ISP) between multiple computers:

  • router gets the IP address issued directly from the ISP.
  • router creates and manages a subnet for all the computers connected to that router. If your computer's address falls into one of the reserved subnet ranges listed earlier, you're going through a router rather than connecting directly to the Internet.

IP addresses on a subnet have two parts, network and node:

  • network (router/server): identifies the subnet itself.
  • node (host/client): also called the host, is an individual piece of computer equipment connected to the network and requiring a unique address.

subnet mask:

  • A subnet mask consists of numbers that, when converted to binary, form a series of 1 bits followed by a series of 0 bits:
    • 255.0.0.0 = 11111111.00000000.00000000.00000000 = eight bits for networks, 24 bits for nodes
    • 255.192.0.0 = 11111111.11000000.00000000.00000000 = 10 bits for networks, 22 bits for nodes
    • 255.255.0.0 = 11111111.11111111.00000000.00000000 = 16 bits for networks, 16 bits for nodes
    • 255.255.255.0 = 11111111.11111111.11111111.00000000 = 24 bits for networks, eight bits for nodes
  • Thus, if there are many subnets where each subnet has very few hosts, the 255.255.255.0 mask suits such a network better since more bits are allocated to identifying subnets. If there are few subnets and many hosts, you might want to use the 255.0.0.0 mask.

Note: for a given subnet address, the first address is reserved as the address of the subnet itself, and the last address is the broadcast address for all nodes/hosts on that subnet. Also, a host using 127.0.0.1 is talking to itself (loopback). See the ipcalc sketch below for deriving these values.
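
A quick way to derive these addresses is the ipcalc utility (shipped with the initscripts package on RHEL-family systems); the 192.168.10.5/24 host below is just an example:

```
# Print the network (subnet) address and the broadcast address for a /24 host.
ipcalc -n -b 192.168.10.5/24
# NETWORK=192.168.10.0
# BROADCAST=192.168.10.255
```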

DNS

Domain Name System (DNS) is a data exchange protocol on the internet (see TCP/IP). It is basically a yellow-pages database which converts user-friendly names like "www.someWebsiteName.com" into an actual IP address such as "129.2.16.3"; in other words, it is a catalog of aliases. When your machine places a request to access a website or to send an email, a server performs DNS name resolution (the process of converting an alias to an actual IP address). This database is distributed over millions of machines throughout the world and yet behaves like a single integrated database. A DNS server usually performs one of two tasks:

  • Maintain a small database of the domain names and IP addresses most often used on its own network (store frequently used data locally to reduce internet traffic to other DNS domains). This service is usually managed by your local ISP and handles requests as follows:
    • If it has the domain name and IP address in its database, it resolves the name itself.
    • If it doesn't have the domain name and IP address in its database, it contacts another DNS server on the Internet. It may have to do this multiple times.
    • If it has to contact another DNS server, it caches the lookup results for a limited time so it can quickly resolve subsequent requests to the same domain name.
    • If it has no luck finding the domain name after a reasonable search, it returns an error indicating that the name is invalid or doesn't exist.
  • Pair IP addresses with all the hosts and subdomains for which the DNS server has authority:
    • This category is associated with Web, mail and other internet domain hosting services.
    • Start of Authority (SOA): the DNS server that manages a specific domain.
    • Time To Live (TTL): when a host is looked up, the results containing its IP address propagate through other DNS servers. The time each server caches the results for is known as the TTL, which can range from a few minutes to a few days. Thus, when you look up the IP address for www.someWebSite.com, the result of the DNS resolution (its IP) might be retrieved from your local server instead of the SOA server.
    • Root Name Servers are the DNS servers at the top of the domain hierarchy for a given top-level domain; they are contacted as a last resort to help track down the SOA for a domain (see the dig sketch below).
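
A sketch of poking at these records with dig (part of bind-utils); example.com is just the standard documentation domain:

```
# Show the zone's Start of Authority record.
dig +noall +answer example.com SOA
# The second column of an answer line is the TTL remaining in the resolver's cache (seconds).
dig +noall +answer www.example.com
```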

Note: during the transition between the IPv4 and IPv6 standards, a machine on the network may have BOTH an IPv4 and an IPv6 unique IP address.

Finding your IP Address:

  • Windows: ipconfig
  • Linux: ifconfig

Note: if you're on a home or small local network (subnet), your address will probably be in the form 192.168.x.x, 172.16.x.x or 10.x.x.x (where x is a number between 0 and 255). These are reserved addresses used on each local network, and a router on that network then connects you to the Internet.

Domain Name also known as a web address consists of a string of characters separated by dots:

  • The last word in a Domain Name represents a Top-Level Domain:
    • com commercial
    • org non-profit organizations
    • net network providers
    • mil military
    • gov government organizations
    • edu educational organizations
    • info entities providing information services (see iana.org for the complete list)
  • Each word-and-dot combination added before the Top-Level Domain indicates a level in the domain structure, where each level refers to a server or a group of servers that manage that domain level. Example:
    • bbc.co.uk which is the BBC's domain under the CO domain under UK domain!
  • The leftmost word in the domain name (www or mail) is a Host Name, which specifies the name of a specific machine in a domain.

To create a new online domain:

  • Find a unique domain name that isn't yet registered, using the Whois database.
  • Register your new domain with a registrar.
  • You can host your domain either at a hosting company's DNS server, the registrar's DNS server, or on your own DNS server. If you are hosting your domain at a hosting company instead of your registrar, configure the registrar to point your domain name to the correct host name or IP address. When you are not hosting your own domain, but use a hosting company or a registrar, your domain is said to be a Parked Domain.

Network Address Translation (NAT) allows a single device (router) to act as an agent between the Internet and the local network. Thus, only one unique IP address is required to represent the whole local network by attaching that IP address to the router (as opposed to issuing an IP address to each machine). NAT flavors:

  • Static NAT: Mapping an unregistered local IP address to a registered public IP address on a one-to-one basis (equal offset from the base IP address). Particularly useful when a device needs to be accessible from outside the network.
  • Dynamic NAT: Maps the first available unregistered local IP address to a registered public IP address from a group of registered IP addresses. Work steps:
    • An internal network (stub domain) has been set up with IP addresses that were not specifically allocated to that company by IANA (Internet Assigned Numbers Authority), the global authority that hands out IP addresses. These addresses should be considered non-routable since they are not unique.
    • The company sets up a NAT-enabled router. The router has a range of unique IP addresses given to the company by IANA.
    • A computer on the stub domain attempts to connect to a computer outside the network, such as a Web server.
    • The router receives the packet from the computer on the stub domain.
    • The router saves the computer's non-routable IP address to an address translation table and replaces the sending computer's non-routable IP address with the first available IP address out of the range of unique IP addresses. The translation table now maps the computer's non-routable IP address to one of the unique IP addresses.
    • When a packet comes back from the destination computer, the router checks the destination address on the packet. It then looks in the address translation table to see which computer on the stub domain the packet belongs to. It changes the destination address to the one saved in the address translation table and sends it to that computer. If it doesn't find a match in the table, it drops the packet.
    • The computer receives the packet from the router. The process repeats as long as the computer is communicating with the external system.
  • Overloading NAT: A form of dynamic NAT that maps multiple unregistered IP addresses to a single registered IP address by using different ports. This is also known as PAT (Port Address Translation), single-address (aka unicast) NAT, or port-level multiplexed (aka multicast) NAT (see the sketch after this list).
    • This NAT overloading type uses a multiplexing feature of the TCP/IP protocol stack which allows a computer to maintain several concurrent connections with remote computers using different TCP or UDP ports. When an IP packet of data is sent it contains the following information:
      • Source Address: the IP address of the originating computer (ex: 129.204.40.15).
      • Source Port (16 bits): the TCP or UDP port number assigned by the originating computer for this packet (ex: port 1080).
      • Destination Address: the IP address of the receiving computer (ex: 204.65.135.57).
      • Destination Port (16 bits): the TCP or UDP port number that the originating computer is asking the receiving computer to open (ex: port 3021).
      • NOTE: the addresses specify the two machines at each end, while the port numbers provide a means of unique identification.
    • Work steps:
      • An internal network (stub domain) has been set up with non-routable IP addresses that were not specifically allocated to that company by IANA.
      • The company sets up a NAT-enabled router. The router has a unique IP address given to the company by IANA.
      • A computer on the stub domain attempts to connect to a computer outside the network, such as a Web server.
      • The router receives the packet from the computer on the stub domain.
      • The router saves the computer's non-routable IP address and port number to an address translation table. It replaces the sending computer's non-routable IP address with the router's IP address, and replaces the sending computer's source port with the port number that matches where the router saved the sending computer's address information in the translation table. The translation table now maps the computer's non-routable IP address and port number to the router's IP address.
      • When a packet comes back from the destination computer, the router checks the destination port on the packet. It then looks in the address translation table to see which computer on the stub domain the packet belongs to. It changes the destination address and destination port to the ones saved in the address translation table and sends it to that computer.
      • The computer receives the packet from the router. The process repeats as long as the computer is communicating with the external system.
      • Since the NAT router now has the computer's source address and source port saved to the address translation table, it will continue to use that same port number for the duration of the connection. A timer is reset each time the router accesses an entry in the table. If the entry is not accessed again before the timer expires, the entry is removed from the table.
  • Overlapping NAT: When the IP addresses used on your internal network are registered IP addresses in use on another network, the router must maintain a lookup table of these addresses so that it can intercept them and replace them with registered unique IP addresses. The NAT router must translate the "internal" addresses to registered unique addresses, and also translate the "external" registered addresses to addresses that are unique to the private network. This can be done either through static NAT or by using DNS and implementing dynamic NAT.
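
On Linux, overloading NAT (PAT) is what the iptables MASQUERADE target implements. A minimal sketch, assuming the stub domain is 192.168.10.0/24 and eth0 faces the public network:

```
# Allow the box to forward packets, then masquerade the stub domain out eth0.
sysctl -w net.ipv4.ip_forward=1
iptables -t nat -A POSTROUTING -s 192.168.10.0/24 -o eth0 -j MASQUERADE
# Verify the translation rule and its hit counters.
iptables -t nat -L POSTROUTING -n -v
```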

LAN: The internal network is usually a LAN (Local Area Network), commonly referred to as the Stub Domain. A stub domain is a LAN that uses IP addresses internally. Most of the network traffic in a Stub Domain is local, so it doesn't travel outside the internal network. A stub domain can include both registered and unregistered IP addresses. Of course, any computers that use unregistered IP addresses must use Network Address Translation to communicate with the rest of the world.

The NAT router is configured to translate unregistered IP addresses (a.k.a. inside, local IP addresses that reside on the private/local/inside network) to registered IP addresses, which is required when a device on a local network with an unregistered IP address needs to communicate with the public network:

  • Inside Local Addresses: the group of unregistered IP addresses used by the local hosts. Most machines on the local network use this type of IP address to communicate with each other through the subnet devices. If a machine using this type of IP address needs to communicate with the public network, the communication has to be handled by the NAT router.
  • Outside Local Addresses: the smaller group of unregistered IP addresses used by the subnet device (NAT router, or NAT server) to represent data coming from devices on the public network.
  • Inside Global Addresses: the group of registered IP addresses assigned by the ISP, to be used by the subnet device (NAT router, or NAT server) to communicate with the public network (outside world). If a machine on the local network communicates with the public network extensively, it might be assigned this type of IP address, which takes the IP translation by the subnet device (NAT router, or NAT server) out of the equation.
  • Outside Global Addresses: the group of registered IP addresses that represent devices on the public network (outside world).

When a computer on the local network (stub domain) wants to communicate outside the network:

  • The packet first goes to one of the NAT routers.
  • The NAT router checks the routing table to see if it has an entry for the destination address. If it does, the NAT router translates the packet and creates an entry for it in the address translation table. If the destination address is not in the routing table, the packet is dropped.
  • If this local machine uses an Inside Global Address, the router does not need to translate it and sends the packet straight to its destination.

A computer on the public network sends a packet to the private network:

  • The source address on the packet is an outside global address. The destination address is an inside global address.
  • The NAT router looks at the address translation table and determines that the destination address is in there, mapped to a computer on the stub domain.
  • The NAT router translates the inside global address of the packet to the inside local address, and sends it to the destination computer.

WIKI:

Network Identifier & Host Identifier (a.k.a. rest field).

Classful Networking: Classes A, B, C, D, and E, where:

  • Classes A, B and C differ in the number of bits allocated for the Network Identifier.
  • Class D is used for Multicast Addressing.
  • Class E is reserved for future applications.

  • VLSM: Variable Length Subnet Mask
  • CIDR: Classless Inter-Domain Routing (xx.xx.xx.xx/x)
  • IANA: Internet Assigned Numbers Authority
  • RIRs: Regional Internet Registries

Special Use Addresses (reserved address blocks):

| Range | Description | Reference |
| --- | --- | --- |
| 0.0.0.0/8 | Current network (only valid as source address) | RFC 6890 |
| 10.0.0.0/8 | Private network | RFC 1918 |
| 100.64.0.0/10 | Shared Address Space | RFC 6598 |
| 127.0.0.0/8 | Loopback | RFC 6890 |
| 169.254.0.0/16 | Link-local | RFC 3927 |
| 172.16.0.0/12 | Private network | RFC 1918 |
| 192.0.0.0/24 | IETF Protocol Assignments | RFC 6890 |
| 192.0.2.0/24 | TEST-NET-1, documentation and examples | RFC 5737 |
| 192.88.99.0/24 | IPv6 to IPv4 relay | RFC 3068 |
| 192.168.0.0/16 | Private network | RFC 1918 |
| 198.18.0.0/15 | Network benchmark tests | RFC 2544 |
| 198.51.100.0/24 | TEST-NET-2, documentation and examples | RFC 5737 |
| 203.0.113.0/24 | TEST-NET-3, documentation and examples | RFC 5737 |
| 224.0.0.0/4 | IP multicast (former Class D network) | RFC 5771 |
| 240.0.0.0/4 | Reserved (former Class E network) | RFC 1700 |
| 255.255.255.255 | Broadcast | |

The 3 ranges of addresses reserved for Private Networks:

| Name | Address range | Number of addresses | Classful description | Largest CIDR block |
| --- | --- | --- | --- | --- |
| 24-bit block | 10.0.0.0–10.255.255.255 | 16777216 | Single Class A | 10.0.0.0/8 |
| 20-bit block | 172.16.0.0–172.31.255.255 | 1048576 | Contiguous range of 16 Class B blocks | 172.16.0.0/12 |
| 16-bit block | 192.168.0.0–192.168.255.255 | 65536 | Contiguous range of 256 Class C blocks | 192.168.0.0/16 |

VPN (Virtual Private Network) and IP Tunnels are used when private networks need to communicate with each other.

Process C-States

Source: http://www.hardwaresecrets.com/everything-you-need-to-know-about-the-cpu-c-states-power-saving-modes

C1-State:

  • a.k.a. Halt State or Auto Halt State.
  • Enters this Halt State when HLT instruction is received.
  • CPU enters traditional Halt State and is completely idle.
  • The Internal CPU Clock signal is stopped.
  • The 2 units within the CPU whose clock signal isn't stopped are:
    • Bus Interface Unit
    • Advanced Programmable Interrupt Controller (APIC)
  • This allows the CPU to temporarily exit the Halt State when an important request comes through the CPU External Bus.
  • Stop Clock Snoop State: the CPU can temporarily leave the Halt (C1) state to deal with an important request coming through the CPU external bus. After the CPU has handled the request, it goes automatically back to the C1-State.

C2-State (aka Stop Grant):

  • Just like the C1-State but the CPU clock is cut when the STPCLK (Stop Clock) pin is set active instead of using the HLT instruction.
  • The Bus Interface Unit and APIC are still being fed a clock signal.
  • Stop Grant Snoop State: the CPU can temporarily leave this state to deal with an important request coming through the CPU external bus. After the CPU has handled the request, it goes automatically back to the C2-State.

C2E-State (aka Extended Stop Grant):

  • is the same as C2-State mode but also reduces voltage besides stopping the CPU Internal Clock.
  • when enabled, the CPU enters the C2E-State instead of C2-State.
  • Extended Stop Grant Snoop State: the CPU can temporarily leave this state to deal with an important request coming through the CPU external bus. The CPU clock is restored but the CPU voltage is held low. After the CPU has handled the request, it goes automatically back to the C2E-State.

C3-State (aka Sleep State):

  • is the same as the C2-State mode, but the clock is completely disabled, including the Bus Interface Unit and APIC, meaning that in this state the CPU cannot answer important requests coming from the CPU External Bus, nor interrupts.
  • The way a CPU enters this mode depends on the manufacturer.
  • To enter this mode on Intel CPUs, the STPCLK pin is set active first, and then the SLP pin is set active second.
  • To enter this mode on AMD CPUs, the value from the ACPI register is read.
    • If the value is PLVL_2, the chipset will activate the STPCLK pin putting the CPU into C2 mode.
    • If the value is PLVL_3, the chipset will activate STPCLK pin putting the CPU into C3 mode.
  • The AMD CPUs support a sub-mode of the C3-State called AltVID which also reduces the CPU Voltage in addition to killing the clock (similar to Intel's C4 State).

C4-State (aka Deeper Sleep mode):

  • Kills all clock signals to CPU just like C3-State, but also reduces the CPU Voltage.

C4E/C5-State (aka Enhanced Deeper Sleep mode):

  • Kills all Clock Signals (from PLL) to CPU and reduces the CPU Voltage just like C4-State, but further reduces the CPU Voltage after the L2 Memory Cache has been disabled.

C6-State (aka Deep Power Down):

  • When the CPU enters this state it saves its entire architectural state inside a special static RAM, which is fed from an independent power source. This allows the CPU internal voltage to be lowered to any value, including 0 V, which completely turns off the CPU while it is idle. When the CPU is woken up it reloads the previous state of all internal units from this special static RAM. Waking up the CPU from this state takes a lot longer than from the previous states we discussed, but it is still faster than turning off the computer and then turning it back on and loading the operating system. A sketch for inspecting C-state usage follows.
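
A sketch for checking which C-states the kernel's cpuidle driver exposes and how often they are entered (CPU0 is chosen arbitrarily; cpupower comes from the cpupowerutils package installed earlier):

```
# List each C-state's name and how many times CPU0 has entered it.
for s in /sys/devices/system/cpu/cpu0/cpuidle/state*; do
    printf '%-10s usage=%s\n' "$(cat $s/name)" "$(cat $s/usage)"
done
# Summary of supported idle states and their latencies.
cpupower idle-info
```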

Short

Scripts

bwTestEthAllInfRunLog.sh:

[root@localhost bwTestStat]# cat bwTestEthAllInfRunLog.sh
#!/bin/bash

include_dir="$(dirname "$0")/includes/"
source ${include_dir}/include.sh

runTime='1'

window_qperf_statistics Eth3
window_qperf_statistics Eth4


hsNodes=(
hpc_node01
hpc_node02
hpc_node03
hpc_node04
hpc_node05
hpc_node06
hpc_node09
hpc_node10
hpc_node11
hpc_node12
hpc_node13
hpc_node14
)

#export LD_LIBRARY_PATH=""

for node in ${hsNodes[@]}
do
	lsTxInterface=$node #`echo $node|cut -d \- -f 1`	
	ssh $lsTxInterface "killall qperf"
done


exit_on_signal_SIGINT () {
	echo "Script interrupted" 2>&1
	for node in ${hsNodes[@]}
	do
		lsTxInterface=$node #`echo $node|cut -d \- -f 1`	
		ssh $lsTxInterface "killall qperf"
		if [[ $? == 0 ]]; then
			echo "Successfully killed qperf on $lsTxInterface"
		fi
	done
	echo "Exiting with status 4; SIGINT"
	killall window_qperf_statistics
	exit 4;
}

exit_on_signal_SIGTERM () {
	echo "Script terminated" 2>&1
	for node in ${hsNodes[@]}
	do
		lsTxInterface=$node #`echo $node|cut -d \- -f 1`
		ssh $lsTxInterface "killall qperf"
		if [[ $? == 0 ]]; then
			echo "Successfully killed qperf on $lsTxInterface"
		fi
	done
	echo "Exiting with status 5; SIGTERM"
	killall window_qperf_statistics
	exit 5;
}

trap exit_on_signal_SIGINT SIGINT
trap exit_on_signal_SIGTERM SIGTERM

while [[ 1 ]]; do
for rxNode in ${hsNodes[@]}
do
	lsRxInterface=$rxNode #`echo -n $rxNode|cut -d \- -f 1`
	for txNode in ${hsNodes[@]}
	do
		lsTxInterface=$txNode #`echo -n $txNode|cut -d \- -f 1`
		if [ "$txNode" != "$rxNode" ]; then
		
			#RUN ETH3 TEST ON 2 NODES:
			echo "#### #### #### #### #### #### #### #### #### #### #### ===" >> detailedbwTestEth3InfRun.log
			echo "$txNode to $rxNode" >> detailedbwTestEth3InfRun.log
			ssh $lsRxInterface numactl --physcpubind=9,10,11,12 --membind=0 qperf &
			if [[ $? != 0 ]]; then
				echo "exiting 1"
				exit 1;
			fi
			logged_result=$(ssh $lsTxInterface "numactl --physcpubind=9,10,11,12 --membind=0 qperf -t $runTime -ub -m 8972 ${rxNode}-Eth3 udp_bw")
			#echo "$logged_result" | tee -a $detailedLogFile
			echo "$logged_result" >> detailedbwTestEth3InfRun.log
		        echo "$logged_result" | grep "v_bw" >> bwTestEth3InfRun.log
			if [[ $? != 0 ]]; then
				echo "exiting 2"
				exit 2;
			fi
			ssh $lsRxInterface "killall qperf"
			if [[ $? != 0 ]]; then
				echo "exiting 3"
				exit 3;
			fi

			#RUN ETH4 TEST ON THE SAME 2 NODES:
			echo "#### #### #### #### #### #### #### #### #### #### #### ===" >> detailedbwTestEth4InfRun.log
			echo "$txNode to $rxNode" >> detailedbwTestEth4InfRun.log
			ssh $lsRxInterface numactl --physcpubind=24,25,26,27 --membind=0 qperf &
			if [[ $? != 0 ]]; then
				echo "exiting 1"
				exit 1;
			fi
			logged_result=$(ssh $lsTxInterface "numactl --physcpubind=24,25,26,27 --membind=1 qperf -t $runTime -ub -m 8972 ${rxNode}-Eth4 udp_bw")
			#echo "$logged_result" | tee -a $detailedLogFile
			echo "$logged_result" >> detailedbwTestEth4InfRun.log
		        echo "$logged_result" | grep "v_bw" >> bwTestEth4InfRun.log
			if [[ $? != 0 ]]; then
				echo "exiting 2"
				exit 2;
			fi
			ssh $lsRxInterface "killall qperf"
			if [[ $? != 0 ]]; then
				echo "exiting 3"
				exit 3;
			fi
		fi
	done
done
done	

exit 0;

bwTestEthXInfRunLog.sh:

[root@localhost bwTestStat]# cat bwTestEthXInfRunLog.sh
#!/bin/bash

runTime='1'

if [[ $# -ge 1 ]]; then
	ethX=$1
	echo "**************ITS AN $ethX********************"
	if [[ $1 == "eth3" ]]; then
		pcpus="9,10,11,12"
		logFile="log.bwTest${ethX}InfRun.txt"
		detailedLogFile="detailedLog.bwTest${ethX}InfRun.txt"
		numaBind=0
	fi
	if [[ $1 == "eth4" ]]; then
		pcpus="24,25,26,27"
		logFile="log.bwTest${ethX}InfRun.txt"
		detailedLogFile="detailedLog.bwTest${ethX}InfRun.txt"
		numaBind=1
	fi
else
	echo "Must pass an argument: \"$0 eth3\" or \"$0 eth4\" "
	exit
fi

#Since we use window_qperf_statistics function to watch the content of $logFile file, let's make sure the file exists before we run the function.
ls $logFile 
if [[ $? != 0 ]]; then
	touch $logFile
fi

window_qperf_statistics $ethX


hsNodes=(
hpc_node01-${ethX}
hpc_node02-${ethX}
hpc_node03-${ethX}
hpc_node04-${ethX}
hpc_node05-${ethX}
hpc_node06-${ethX}
hpc_node09-${ethX}
hpc_node10-${ethX}
hpc_node11-${ethX}
hpc_node12-${ethX}
hpc_node13-${ethX}
hpc_node14-${ethX}
)





#export LD_LIBRARY_PATH=""


for node in ${hsNodes[@]}
do
	lsTxInterface=`echo $node|cut -d \- -f 1`	
	ssh $lsTxInterface "killall qperf"
done


exit_on_signal_SIGINT () {
	echo "Script interrupted" 2>&1
	for node in ${hsNodes[@]}
	do
		lsTxInterface=`echo $node|cut -d \- -f 1`	
		ssh $lsTxInterface "killall qperf"
		if [[ $? == 0 ]]; then
			echo "Successfully killed qperf on $lsTxInterface"
		fi
	done
	echo "Exiting with status 4; SIGINT"
	killall window_qperf_statistics
	exit 4;
}

exit_on_signal_SIGTERM () {
	echo "Script terminated" 2>&1
	for node in ${hsNodes[@]}
	do
		lsTxInterface=`echo $node|cut -d \- -f 1`	
		ssh $lsTxInterface "killall qperf"
		if [[ $? == 0 ]]; then
			echo "Successfully killed qperf on $lsTxInterface"
		fi
	done
	echo "Exiting with status 5; SIGTERM"
	killall window_qperf_statistics
	exit 5;
}

trap exit_on_signal_SIGINT SIGINT
trap exit_on_signal_SIGTERM SIGTERM


#	echo ""
#	echo "#### #### #### #### #### #### #### #### #### #### #### ==="
#	echo "Testing all high speed interfaces against $lsRxInterface"

while [[ 1 ]]; do
for rxNode in ${hsNodes[@]}
do
	lsRxInterface=`echo -n $rxNode|cut -d \- -f 1`

#	echo ""
#	echo "#### #### #### #### #### #### #### #### #### #### #### ==="
#	echo "Testing all high speed interfaces against $lsRxInterface"

	for txNode in ${hsNodes[@]}
	do
		lsTxInterface=`echo -n $txNode|cut -d \- -f 1`
		if [ "$txNode" != "$rxNode" ]; then
			#echo "#### #### #### #### #### #### #### #### #### #### #### ==="
			#echo "$txNode to $rxNode" | tee -a $detailedLogFile
			echo "#### #### #### #### #### #### #### #### #### #### #### ===" >> $detailedLogFile
			echo "$txNode to $rxNode" >> $detailedLogFile
			ssh $lsRxInterface numactl --physcpubind=$pcpus --membind=$numaBind qperf &
			if [[ $? != 0 ]]; then
				echo "exiting 1"
				exit 1;
			fi
			logged_result=$(ssh $lsTxInterface "numactl --physcpubind=$pcpus --membind=$numaBind qperf -t $runTime -ub -m 8972 $rxNode udp_bw")
			#echo "$logged_result" | tee -a $detailedLogFile
			echo "$logged_result" >> $detailedLogFile
		        echo "$logged_result" | grep "v_bw" >> $logFile 
			if [[ $? != 0 ]]; then
				echo "exiting 2"
				exit 2;
			fi
			ssh $lsRxInterface "killall qperf"
			if [[ $? != 0 ]]; then
				echo "exiting 3"
				exit 3;
			fi
		fi
	done
done
done	



exit 0;

bwTestEthXTwoBlades.sh:

[root@localhost bwTestStat]# cat bwTestEthXTwoBlades.sh
#!/bin/bash

numberOfRepeats='1'
secs="1"

nic=`echo $1 | cut -d \- -f 2`
echo "$nic"

if [[ $# -ge 1 ]]; then
        echo "**************ITS AN $nic********************"
        if [[ $nic == "eth3" ]]; then
                pcpus="9,10,11,12"
  #              logFile="bwTestEth3InfRun.log"
                numaBind=0
        fi
        if [[ $nic == "eth4" ]]; then
                pcpus="24,25,26,27"
   #             logFile="bwTestEth4InfRun.log"
                numaBind=1
        fi
else
        echo "Must pass an argument: \"$0 hpc_node01-eth3 hpc_node02-eth3\" "
        exit
fi




repeatCounter=0

if [[ $# != 2 ]]; then
	echo "PROVIDE 2 NODES!"
	echo "Example: $0 hpc_node01-eth3 hpc_node02-eth3"
	exit 6;
fi

hsNodes=(
$1
$2
)


#export LD_LIBRARY_PATH=""


lsRxInterface=`echo $1 | cut -d \- -f 1`
#echo $lsRxInterface

exit_on_signal_SIGINT () {
	echo "Script interrupted" 2>&1
	for node in ${hsNodes[@]}
	do
		lsTxInterface=`echo $node|cut -d \- -f 1`	
		ssh $lsTxInterface "killall qperf"
		if [[ $? == 0 ]]; then
			echo "Successfully killed qperf on $lsTxInterface"
		fi
	done
	echo "Exiting with status 4; SIGINT"
	exit 4;
}

exit_on_signal_SIGTERM () {
	echo "Script terminated" 2>&1
	for node in ${hsNodes[@]}
	do
		lsTxInterface=`echo $node|cut -d \- -f 1`	
		ssh $lsTxInterface "killall qperf"
		if [[ $? == 0 ]]; then
			echo "Successfully killed qperf on $lsTxInterface"
		fi
	done
	echo "Exiting with status 5; SIGTERM"
	exit 5;
}

trap exit_on_signal_SIGINT SIGINT
trap exit_on_signal_SIGTERM SIGTERM

echo ""
echo "#### #### #### #### #### #### #### #### #### #### #### #### #### #### ="
echo "Testing all high speed interfaces against $lsRxInterface"
echo "#### #### #### #### #### #### #### #### #### #### #### #### #### #### ="
echo "$1 to $2"

while [[ $repeatCounter -lt $numberOfRepeats ]]; do
for rxNode in ${hsNodes[@]}
do
	lsRxInterface=`echo $rxNode|cut -d \- -f 1`


	for txNode in ${hsNodes[@]}
	do
		lsTxInterface=`echo $txNode|cut -d \- -f 1`
		if [ "$txNode" != "$rxNode" ]; then

			#ssh $lsRxInterface numactl --physcpubind=10,11,12,13 --membind=0 qperf &
			ssh $lsRxInterface numactl --physcpubind=$pcpus --membind=$numaBind qperf &
			if [[ $? != 0 ]]; then
				echo "exiting 1"
				exit 1;
			fi

			#ssh $lsTxInterface "numactl --physcpubind=10,11,12,13 --membind=0 qperf -v -t $secs -ub -m 8972 $rxNode udp_bw"
			ssh $lsTxInterface "numactl --physcpubind=$pcpus --membind=$numaBind qperf -v -t $secs -ub -m 8972 $rxNode udp_bw"
			if [[ $? != 0 ]]; then
				echo "exiting 2"
				exit 2;
			fi

			ssh $lsRxInterface "killall qperf"
			if [[ $? != 0 ]]; then
				echo "exiting 3"
				exit 3;
			fi
		fi
	done
done

repeatCounter=$((repeatCounter + 1))

done


exit 0;

includes/include.sh:

[root@localhost bwTestStat]# cat includes/include.sh 
# .bashrc
echo "INCLUDE SCRIPTS"

export CHROOTDIR=/opt/chroots/hpc_os_images/



function show_qperf_statistics ()
{
ethX=$1
logFile="/root/bwTestStat/log.bwTest${ethX}InfRun.txt"

receivedGigaBits=({45..0})

count=0

for receivedResult in ${receivedGigaBits[@]}
do
cat $logFile | grep $receivedResult >> /dev/null;
	if [[ $? == 0 ]]; then
	list=$(cat $logFile | egrep -o " ${receivedResult}[\.]{1,1}| ${receivedResult} " | awk '{for(i=1;i<=NF;i++)printf "%.0f ",$i;print ""}'); 
	for entry in ${list[@]}
	do
		if [[ $entry == $receivedResult ]]; then
		count=$((count + 1))
		fi
	done
	if [[ $count -gt 0 ]]; then
		echo "$receivedResult Gb/s: $count"
	fi
	count=0
	fi
done

	return
}

export -f show_qperf_statistics



function show_network_RB_info ()
{
#!/bin/bash

if [[ $# == 2 ]]; then
	ssh $1 ethtool $2 
	ssh $1 ethtool -c $2 
	ssh $1 ethtool -i $2 
	ssh $1 ethtool -g $2 
	ssh $1 ethtool -l $2 
	ssh $1 sysctl net.core
elif [[ $# == 1 ]]; then
	ethtool $1
        ethtool -c $1
        ethtool -i $1
        ethtool -g $1
        ethtool -l $1
        sysctl net.core
else
	echo "provide nodes, example: $0 hpc_node01 eth3"
fi
	return
}

export -f show_network_RB_info



#THIS FUNCTION PRINTS NIC INFORMATION:
function show_ethX_info ()
{
if [[ $# == 2 ]]; then
	ssh $1 ethtool $2 
	ssh $1 ethtool -c $2 
	ssh $1 ethtool -i $2 
	ssh $1 ethtool -g $2 
	ssh $1 ethtool -l $2 
	echo -n "$2 is on NUMA node"
	ssh $1 "cat /sys/class/net/$2/device/numa_node"
	ssh $1 numactl -H
	ssh $1 sysctl net.core
elif [[ $# == 1 ]]; then

	ethtool $1 
	ethtool -c $1 
	ethtool -i $1 
	ethtool -g $1 
	ethtool -l $1 

	if [[ $(cat /sys/class/net/$1/device/numa_node) == "-1" ]]; then
		echo "No numa nodes exist"
	else 
		echo -n "$1 is on NUMA node "
		cat /sys/class/net/$1/device/numa_node
	fi

	numactl -H
	sysctl net.core
else
	echo "provide nodes, example: \" $0 hpc_node01 eth3 \" or interface name \" $0 em1 \" "
	echo "provide nodes, example: \" $0 hpc_node01 eth3 \" or interface name \" $0 em1 \" "
fi
	return
}

export -f show_ethX_info



#THIS FUNCTION PRINTS CPU INFORMATION:
function show_cpu_info ()
{
if [[ $# == 0 ]]; then

	cat /proc/cpuinfo
	lscpu

	# bios, system, baseboard, chassis, processor, memory, cache, connector, slot
	# /dev/mem
	dmidecode -t processor

	#ll /sys/devices/system/cpu/
	#ll /sys/devices/system/cpu/cpu[NUM]/cpufreq/
	# /proc/


	cpupower frequency-info || cpufreq-info
	if [[ $? != 0 ]]; then
		echo "Consider Installing \"cpupower\" or \"cpufreq-utils\""
	fi

	numactl -H

elif [[ $# == 1 ]]; then

	ssh $1 "cat /proc/cpuinfo"
	ssh $1 "lscpu"

	# bios, system, baseboard, chassis, processor, memory, cache, connector, slot
	# /dev/mem
	ssh $1 "dmidecode -t processor"

	#ll /sys/devices/system/cpu/
	#ll /sys/devices/system/cpu/cpu[NUM]/cpufreq/
	# /proc/


	ssh $1 "cpupower frequency-info || cpufreq-info"
	if [[ $? != 0 ]]; then
		echo "Consider Installing \"cpupower\" or \"cpufreq-utils\""
	fi

	ssh $1 "numactl -H"

else
	echo "provide nodes, example: $0 hpc_node01"
fi
	return
}

export -f show_cpu_info



function window_ethtool_stat ()
{
interf="eth3"

if [[ $# == 1 ]]; then
#	watch -d -n 0.1 ssh $1 'ethtool --statistics eth3 | grep -E "tx_packets|tx_bytes|rx_packets|rx_bytes|errors|dropped|collisions|aggregated|flushed|stopped|timeout|fail|filtered|clean"'
#	watch -d -n 0.1 ssh $1 "ethtool --statistics $interf | grep -E \"tx_packets|tx_bytes|rx_packets|rx_bytes|errors|dropped|collisions|aggregated|flushed|stopped|timeout|fail|filtered|clean\""
	gnome-terminal --geometry=28x32+2545+280 --title="$1 ETHTOOL STAT" --window-with-profile="minimized" -x bash -c "watch -d -n 0.1 ssh $1 \"ethtool --statistics $interf | grep -E 'tx_packets|tx_bytes|rx_packets|rx_bytes|errors|dropped|collisions|aggregated|flushed|stopped|timeout|fail|filtered|clean'\""
else
	echo "An argument is required: window_ethtool_stat hpc_node01"
fi
	return
}

export -f window_ethtool_stat 



function window_netstat ()
{
interf=eth3

if [[ $# == 1 ]]; then
#	watch -d -n 0.1 ssh $1 "netstat --interfaces=$interf; netstat --statistics --udp"
	
	gnome-terminal --geometry=85x25+1300+280 --title="$1 NETSTAT" --window-with-profile="minimized" -x bash -c "watch -d -n 0.1 ssh $1 \"netstat --interfaces=eth3; netstat --statistics --udp\""
else
	echo "An argument is required: window_ethtool_stat hpc_node01"
fi
	return
}

export -f window_netstat 



function window_proc_interrupts ()
{
interf=eth3


if [[ $# == 1 ]]; then
	#watch -d -n 0.1 ssh $1 "egrep \"CPU\|$interf\" /proc/interrupts"
	gnome-terminal --geometry=345x5+0+0 --title="$1 PROC IRQS" --profile="minimized" -x bash -c "watch -d -n 0.1 ssh $1 \"egrep 'CPU\|eth3' /proc/interrupts\""
else
	echo "An argument is required: window_proc_interrupts hpc_node01"
fi
	return
}

export -f window_proc_interrupts 



function window_proc_softirqs ()
{
interf="eth3"

if [[ $# == 1 ]]; then
	#watch -d -n 1 ssh $1 "egrep 'CPU\|TX\|RX' /proc/softirqs"
	gnome-terminal --geometry=325x5+2450+0 --title="$1 SOFT IRQS" --window-with-profile="minimized" -x bash -c "watch -d -n 1 ssh $1 \"egrep 'CPU\|TX\|RX' /proc/softirqs\""
	
else
	echo "PROBLEMA in window_ethtool_stat.sh!!!"
fi

return
}

export -f window_proc_softirqs



function window_sensors ()
{
interf=eth3

if [[ $# == 1 ]]; then
	#watch -d -n 0.1 ssh $1 "sensors"
	gnome-terminal --geometry=59x37+2970+280 --title="$1 SENSORS" --window-with-profile="minimized" -x bash -c "watch -d -n 0.1 ssh $1 \"sensors\""
else
	echo "PROBLEMA: need argument to be passed to netstat.sh "
fi

return
}

export -f window_sensors



function window_softnet_stat ()
{
if [[ $# == 1 ]]; then
	#watch -d -n 0.1 ssh $1 cat /proc/net/softnet_stat
	gnome-terminal --geometry=90x30+0+280 --title="$1 SOFTNET STAT" --window-with-profile="minimized" -x bash -c "watch -d -n 0.1 ssh $1 cat /proc/net/softnet_stat"
else
        echo "PROBLEMA window_softnet_stat.sh!!!"
fi

return
}

export -f window_softnet_stat


function window_qperf_statistics ()
{
ethX=$1
#if [[ $# == 1 ]]; then
	gnome-terminal --geometry=64x30+0+280 --title="$1 QPERF STATISTICS" --window-with-profile="minimized" -x bash -c "watch -d -n 0.1 show_qperf_statistics $ethX"
#else
#        echo "PROBLEMA window_softnet_stat.sh!!!"
#fi

return
}

export -f window_qperf_statistics



function open_windows_3_displays ()
{
#window_proc_interrupts hpc_node02

if [[ $# == 2 ]]; then
#	gnome-terminal --geometry=345x5+0+0 --title="$1 PROC IRQS" --profile="minimized" -x bash -c "window_proc_interrupts $1"
#	gnome-terminal --geometry=345x5+0+150 --title="$2 PROC IRQS" --window-with-profile="minimized" -x bash -c "window_proc_interrupts $2"
	window_proc_interrupts $1
	window_proc_interrupts $2
#	gnome-terminal --geometry=325x5+2450+0 --title="$1 SOFT IRQS" --window-with-profile="minimized" -x bash -c "window_proc_softirqs $1"
#	gnome-terminal --geometry=326x5+2450+155 --title="$2 SOFT IRQS" --window-with-profile="minimized" -x bash -c "window_proc_softirqs $2"
	window_proc_softirqs $1
	window_proc_softirqs $2
#	gnome-terminal --geometry=90x30+0+280 --title="$1 SOFTNET STAT" --window-with-profile="minimized" -x bash -c "window_softnet_stat $1"
#	gnome-terminal --geometry=90x30+650+280 --title="$2 SOFTNET STAT" --window-with-profile="minimized" -x bash -c "window_softnet_stat $2"
	window_softnet_stat $1
	window_softnet_stat $2
#	gnome-terminal --geometry=85x25+1300+280 --title="$1 NETSTAT" --window-with-profile="minimized" -x bash -c "window_netstat $1"
#	gnome-terminal --geometry=85x25+1925+280 --title="$2 NETSTAT" --window-with-profile="minimized" -x bash -c "window_netstat $2"
	window_netstat $1
	window_netstat $2	
#	gnome-terminal --geometry=28x32+2545+280 --title="$1 ETHTOOL STAT" --window-with-profile="minimized" -x bash -c "window_ethtool_stat $1"
#	gnome-terminal --geometry=28x32+2750+280 --title="$2 ETHTOOL STAT" --window-with-profile="minimized" -x bash -c "window_ethtool_stat $2"
	window_ethtool_stat $1
	window_ethtool_stat $2

#	gnome-terminal --geometry=59x37+2970+280 --title="$1 SENSORS" --window-with-profile="minimized" -x bash -c "window_sensors $1"	
#	gnome-terminal --geometry=59x37+3400+280 --title="$2 SENSORS" --window-with-profile="minimized" -x bash -c "window_sensors $2"	
	window_sensors $1
	window_sensors $2

else
	echo "provide nodes, example: $0  hpc_node01 hpc_node02 !"
fi

return
}



function set_hpc_network_params ()
{
wwsh ssh hpc_node[$1-$2] "ethtool -L eth3 tx 1 rx 1"
wwsh ssh hpc_node[$1-$2] "ethtool -G eth3 rx 8192 tx 4096"
wwsh ssh hpc_node[$1-$2] "sysctl -w net.core.netdev_budget=600"
return
}

function pingnodes ()
{
nodes=(
c2node{01..06}
c2node{09..14}
)

for anode in ${nodes[@]}
do
ssh $i "exit"
  if [[ $? == 0 ]]; then
    echo "$anode is online"
  else
    echo "!!!$anode is offline!!!"
  fi
done
return
}