[
  /* Metric definitions, one ClassAd per metric.  Each ad names the metric
     (Name), the expression that is published (Value), a human-readable
     description (Desc), display units (Units), and the ad type(s) it applies
     to (TargetType).  Ads with Aggregate combine the Value over every ad that
     satisfies Requirements. */
  /* Daemon memory footprint.  Scale = 1024 with Units = "bytes": presumably
     the raw MonitorSelf* attributes are reported in KiB -- confirm. */
  Name = strcat(MyType,"MonitorSelfResidentSetSize");
  Value = MonitorSelfResidentSetSize;
  Desc = "RAM allocated to this daemon";
  Units = "bytes";
  Scale = 1024;
  Type = "float";
  Requirements = name!="chtc_ccb@cm.chtc.wisc.edu";
  Machine = "cm.chtc.wisc.edu";
  TargetType = "Negotiator,Collector";
]

[
  Name = strcat(MyType,"MonitorSelfImageSize");
  Value = MonitorSelfImageSize;
  Desc = "Memory allocated to this daemon (i.e. virtual image size)";
  Units = "bytes";
  Scale = 1024;
  Type = "float";
  Requirements = name!="chtc_ccb@cm.chtc.wisc.edu";
  Machine = "cm.chtc.wisc.edu";
  TargetType = "Negotiator,Collector";
]

[
  Name = strcat(MyType,"MonitorSelfResidentSetSize");
  Value = MonitorSelfResidentSetSize;
  Desc = "RAM allocated to this daemon";
  Units = "bytes";
  Scale = 1024;
  Type = "float";
  TargetType = "Scheduler";
]

[
  /* Core usage.  A slot counts as "in use" when it is Claimed or Preempting
     and its Activity is neither Idle nor Suspended; the same predicate is
     repeated in every *InUse* ad below. */
  Aggregate = "SUM";
  Name = "CpusInUse_SingleCore";
  Desc = "Number of CPU cores actively running singlecore jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus==1 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore";
  Desc = "Number of CPU cores actively running multicore jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus>1 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse";
  Desc = "Number of CPU cores actively running jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  /* Logical negation of the CpusInUse predicate. */
  Aggregate = "SUM";
  Name = "CpusNotInUse";
  Desc = "Number of CPU cores not actively running jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = ((State=="Claimed" || State=="Preempting") && Activity!="Idle" && Activity!="Suspended")==False;
  TargetType = "Machine";
]

[
  /* Breakdown of idle cores by reason.  Unclaimed slots are split at
     Memory = 1024 between this ad and CpusNotInUse_NoJobsMatch. */
  Aggregate = "SUM";
  Name = "CpusNotInUse_LowMemory";
  Desc = "Cores inactive because server has less than 1GB free";
  Value = Cpus;
  Units = "cores";
  Requirements = State=="Unclaimed" && Memory < 1024;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusNotInUse_Draining";
  Desc = "Cores inactive because server is being drained";
  Value = Cpus;
  Units = "cores";
  Requirements = State=="Drained";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusNotInUse_ClaimedIdle";
  Desc = "Cores inactive because a job not yet assigned to a claimed slot";
  Value = Cpus;
  Units = "cores";
  Requirements = State=="Claimed" && Activity=="Idle";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusNotInUse_Owner";
  Desc = "Cores inactive because server declared unavailable by the owner";
  Value = Cpus;
  Units = "cores";
  Requirements = State=="Owner";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusNotInUse_Suspended";
  Desc = "Cores inactive because job assigned to that core is suspended";
  Value = Cpus;
  Units = "cores";
  Requirements = Activity=="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusNotInUse_MatchedState";
  Desc = "Cores inactive because a matched slot has not yet been claimed";
  Value = Cpus;
  Units = "cores";
  Requirements = State=="Matched";
  TargetType = "Machine";
]

[
  /* The lower bound is 1024 so that this bucket is the exact complement of
     CpusNotInUse_LowMemory (Memory < 1024): an Unclaimed slot is counted in
     exactly one of the two, never both. */
  Aggregate = "SUM";
  Name = "CpusNotInUse_NoJobsMatch";
  Desc = "Cores inactive because no jobs match to this machine";
  Value = Cpus;
  Units = "cores";
  Requirements = State=="Unclaimed" && Memory >= 1024;
  TargetType = "Machine";
]

[
  /* In-use cores bucketed by slot size.  Each bucket NN above 02 covers
     (previous bound, NN]; the 02 bucket matches Cpus of exactly 2. */
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore02";
  Desc = "Number of CPU cores actively running 2 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus =?= 2 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore04";
  Desc = "Number of CPU cores actively running 4 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus > 2 && Cpus <= 4 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore08";
  Desc = "Number of CPU cores actively running 8 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus > 4 && Cpus <= 8 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore16";
  Desc = "Number of CPU cores actively running 16 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus > 8 && Cpus <= 16 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore20";
  Desc = "Number of CPU cores actively running 20 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus > 16 && Cpus <= 20 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore32";
  Desc = "Number of CPU cores actively running 32 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus > 20 && Cpus <= 32 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "CpusInUse_MultiCore64";
  Desc = "Number of CPU cores actively running 64 core jobs";
  Value = Cpus;
  Units = "cores";
  Requirements = (State=="Claimed" || State=="Preempting") && Cpus > 32 && Cpus <= 64 && Activity!="Idle" && Activity!="Suspended";
  TargetType = "Machine";
]

[
  /* Memory usage.  Every memory ad below also requires TotalMemory<500k,
     which leaves out ads advertising TotalMemory at or above that bound. */
  Aggregate = "SUM";
  Name = "MemoryInUse";
  Desc = "Amount of memory provisioned for running jobs";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  /* Count of in-use slots (Value = 1 per slot) bucketed by slot Memory. */
  Aggregate = "SUM";
  Name = "SlotsInUseByMem_000.5";
  Desc = "Active slots with 512MB memory";
  Value = 1;
  Units = "slots";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 0 && Memory <= 512 && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "SlotsInUseByMem_001";
  Desc = "Active slots with 1GB memory";
  Value = 1;
  Units = "slots";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 512 && Memory <= 1k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "SlotsInUseByMem_002";
  Desc = "Active slots with 2GB memory";
  Value = 1;
  Units = "slots";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 1k && Memory <= 2k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "SlotsInUseByMem_004";
  Desc = "Active slots with 4GB memory";
  Value = 1;
  Units = "slots";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 2k && Memory <= 4k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "SlotsInUseByMem_008";
  Desc = "Active slots with 8GB memory";
  Value = 1;
  Units = "slots";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 4k && Memory <= 8k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "SlotsInUseByMem_999";
  Desc = "Active slots with 999GB memory";
  Value = 1;
  Units = "slots";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 8k && Memory <= 999k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  /* Same Memory buckets as SlotsInUseByMem_*, but summing Memory itself
     instead of counting slots. */
  Aggregate = "SUM";
  Name = "MemoryInUse_000.5";
  Desc = "Total memory provisioned into 512MB slots";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 0 && Memory <= 512 && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryInUse_001";
  Desc = "Total memory provisioned into 1GB slots";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 512 && Memory <= 1k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryInUse_002";
  Desc = "Total memory provisioned into 2GB slots";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 1k && Memory <= 2k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryInUse_004";
  Desc = "Total memory provisioned into 4GB slots";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 2k && Memory <= 4k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryInUse_008";
  Desc = "Total memory provisioned into 8GB slots";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 4k && Memory <= 8k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryInUse_999";
  Desc = "Total memory provisioned into 999GB slots";
  Value = Memory;
  Units = "megabytes";
  Requirements = (State=="Claimed" || State=="Preempting") && Memory > 8k && Memory <= 999k && Activity!="Idle" && Activity!="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  /* Logical negation of the in-use predicate, with the TotalMemory bound. */
  Aggregate = "SUM";
  Name = "MemoryNotInUse";
  Desc = "Amount of memory not actively running jobs";
  Value = Memory;
  Units = "megabytes";
  Requirements = ((State=="Claimed" || State=="Preempting") && Activity!="Idle" && Activity!="Suspended")==False && TotalMemory<500k;
  TargetType = "Machine";
]

[
  /* Breakdown of idle memory by reason.  Unclaimed slots are split at
     Cpus = 1 between this ad and MemoryNotInUse_NoJobsMatch. */
  Aggregate = "SUM";
  Name = "MemoryNotInUse_NoCores";
  Desc = "Memory inactive because server has no cores free";
  Value = Memory;
  Units = "megabytes";
  Requirements = State=="Unclaimed" && Cpus < 1 && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryNotInUse_Draining";
  Desc = "Memory inactive because server is being drained";
  Value = Memory;
  Units = "megabytes";
  Requirements = State=="Drained" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryNotInUse_ClaimedIdle";
  Desc = "Memory inactive because a job not yet assigned to a claimed slot";
  Value = Memory;
  Units = "megabytes";
  Requirements = State=="Claimed" && Activity=="Idle" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryNotInUse_Owner";
  Desc = "Memory inactive because server declared unavailable by the owner";
  Value = Memory;
  Units = "megabytes";
  Requirements = State=="Owner" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryNotInUse_Suspended";
  Desc = "Memory inactive because job assigned to that core is suspended";
  Value = Memory;
  Units = "megabytes";
  Requirements = Activity=="Suspended" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryNotInUse_MatchedState";
  Desc = "Memory inactive because a matched slot has not yet been claimed";
  Value = Memory;
  Units = "megabytes";
  Requirements = State=="Matched" && TotalMemory<500k;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MemoryNotInUse_NoJobsMatch";
  Desc = "Memory inactive because no jobs match to this machine";
  Value = Memory;
  Units = "megabytes";
  Requirements = State=="Unclaimed" && Cpus >= 1 && TotalMemory<500k;
  TargetType = "Machine";
]

[
  /* Metrics published from Defrag ads (no aggregation). */
  Name = strcat(MyType,"MachinesWhole");
  Value = WholeMachines;
  Desc = "Number of machines that were observed to be defragmented in the last polling interval";
  Units = "machines";
  TargetType = "Defrag";
]

[
  Name = strcat(MyType,"MachinesDraining");
  Value = MachinesDraining;
  Desc = "Number of machines that were observed to be draining in the last polling interval";
  Units = "machines";
  TargetType = "Defrag";
]

[
  /* Time since EnteredCurrentState for slots whose State is "Drained",
     divided by 60.0 to report minutes; published as AVG, MIN and MAX. */
  Aggregate = "AVG";
  Name = "DrainingStateTimeAvg";
  Desc = "Average time a machine in the pool is in draining state";
  Value = (time()-EnteredCurrentState)/60.0;
  Units = "minutes";
  Requirements = State=="Drained";
  TargetType = "Machine";
]

[
  Aggregate = "MIN";
  Name = "DrainingStateTimeMin";
  Desc = "Minimum time a machine in the pool is in draining state";
  Value = (time()-EnteredCurrentState)/60.0;
  Units = "minutes";
  Requirements = State=="Drained";
  TargetType = "Machine";
]

[
  Aggregate = "MAX";
  Name = "DrainingStateTimeMax";
  Desc = "Max time a machine in the pool is in draining state";
  Value = (time()-EnteredCurrentState)/60.0;
  Units = "minutes";
  Requirements = State=="Drained";
  TargetType = "Machine";
]

[
  /* Requirements drops slots reporting no usage and slots whose computed
     percentage exceeds 100, so they do not enter the average. */
  Aggregate = "AVG";
  Name = "MemoryEfficiency";
  Desc = "Average percent of provisioned slot memory that is in use";
  Value = real(memoryusage)/memory*100;
  Units = "percent";
  Requirements = MemoryUsage > 0.0 && (real(memoryusage)/memory*100)<=100.0;
  TargetType = "Machine";
]

[
  /* Histogram of partitionable slots by Cpus; bucket NN covers
     (previous bound, NN].
     NOTE(review): the name and Desc of every MachineDrainedUnclaimedCores_NN
     ad say "drained", yet Requirements selects State!="Drained", whereas the
     other ads in this file use State=="Drained" for draining slots.  Confirm
     whether != is intended before relying on these metrics. */
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_01";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 0 && Cpus <= 1;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_02";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 1 && Cpus <= 2;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_04";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 2 && Cpus <= 4;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_08";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 4 && Cpus <= 8;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_16";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 8 && Cpus <= 16;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_20";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 16 && Cpus <= 20;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_32";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 20 && Cpus <= 32;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_48";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 32 && Cpus <= 48;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineDrainedUnclaimedCores_80";
  Desc = "Distribution of drained unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && State!="Drained" && Cpus > 48 && Cpus <= 80;
  TargetType = "Machine";
]

[
  /* Histogram of all partitionable slots by Cpus, with no State filter;
     bucket NN covers (previous bound, NN]. */
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_01";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 0 && Cpus <= 1;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_02";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 1 && Cpus <= 2;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_04";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 2 && Cpus <= 4;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_08";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 4 && Cpus <= 8;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_16";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 8 && Cpus <= 16;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_20";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 16 && Cpus <= 20;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_32";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 20 && Cpus <= 32;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_48";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 32 && Cpus <= 48;
  TargetType = "Machine";
]

[
  Aggregate = "SUM";
  Name = "MachineUnclaimedCores_80";
  Desc = "Distribution of unclaimed cores per machine";
  Value = 1;
  Units = "machines";
  Requirements = PartitionableSlot && Cpus > 48 && Cpus <= 80;
  TargetType = "Machine";
]