/*
 * Copyright (c) Contributors, http://opensimulator.org/
 * See CONTRIBUTORS.TXT for a full list of copyright holders.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the OpenSimulator Project nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE DEVELOPERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using log4net;

namespace OpenSim.Framework.Monitoring
{
    /// <summary>
    /// Manages launching threads and keeping watch over them for timeouts
    /// </summary>
    public static class Watchdog
    {
        /// <summary>Timer interval in milliseconds for the watchdog timer</summary>
        public const double WATCHDOG_INTERVAL_MS = 2500.0d;

        /// <summary>Default timeout in milliseconds before a thread is considered dead</summary>
        public const int DEFAULT_WATCHDOG_TIMEOUT_MS = 5000;

        /// <summary>
        /// Bookkeeping record for a single thread tracked by the watchdog.
        /// </summary>
        [System.Diagnostics.DebuggerDisplay("{Thread.Name}")]
        public class ThreadWatchdogInfo
        {
            /// <summary>
            /// The thread being tracked.
            /// </summary>
            public Thread Thread { get; private set; }

            /// <summary>
            /// Approximate tick when this thread was started.
            /// </summary>
            /// <remarks>
            /// Not terribly good since this quickly wraps around.
            /// </remarks>
            public int FirstTick { get; private set; }

            /// <summary>
            /// Last time this heartbeat update was invoked
            /// </summary>
            public int LastTick { get; set; }

            /// <summary>
            /// Number of milliseconds before we notify that the thread is having a problem.
            /// </summary>
            public int Timeout { get; set; }

            /// <summary>
            /// Is this thread considered timed out?
            /// </summary>
            public bool IsTimedOut { get; set; }

            /// <summary>
            /// Will this thread trigger the alarm function if it has timed out?
            /// </summary>
            public bool AlarmIfTimeout { get; set; }

            /// <summary>
            /// Method to execute if the alarm goes off.  If null then no alarm method is fired.
            /// </summary>
            public Func<string> AlarmMethod { get; set; }

            /// <summary>
            /// Stat structure associated with this thread.
            /// </summary>
            /// <remarks>
            /// Null on instances created through the copy constructor, which does not copy or register a stat.
            /// </remarks>
            public Stat Stat { get; set; }

            /// <summary>
            /// Create tracking info for a thread and register a pull stat that reports the number of
            /// milliseconds since the thread's last heartbeat.
            /// </summary>
            /// <param name="thread">Thread to track.  Its Name is used as the stat name.</param>
            /// <param name="timeout">Number of milliseconds without an update before the thread is flagged.</param>
            public ThreadWatchdogInfo(Thread thread, int timeout)
            {
                Thread = thread;
                Timeout = timeout;
                FirstTick = Environment.TickCount & Int32.MaxValue;
                LastTick = FirstTick;

                Stat
                    = new Stat(
                        thread.Name,
                        string.Format("Last update of thread {0}", thread.Name),
                        "",
                        "ms",
                        "server",
                        "thread",
                        StatType.Pull,
                        MeasuresOfInterest.None,
                        // Parentheses are required: binary '-' binds tighter than '&', so without them the
                        // mask would be applied to (Int32.MaxValue - LastTick) rather than to TickCount.
                        stat => stat.Value = (Environment.TickCount & Int32.MaxValue) - LastTick,
                        StatVerbosity.Debug);

                StatsManager.RegisterStat(Stat);
            }

            /// <summary>
            /// Create a snapshot copy of existing tracking info.
            /// </summary>
            /// <remarks>
            /// The Stat is deliberately not copied, so the copy has a null Stat and registers nothing.
            /// </remarks>
            /// <param name="previousTwi">Info to copy.</param>
            public ThreadWatchdogInfo(ThreadWatchdogInfo previousTwi)
            {
                Thread = previousTwi.Thread;
                FirstTick = previousTwi.FirstTick;
                LastTick = previousTwi.LastTick;
                Timeout = previousTwi.Timeout;
                IsTimedOut = previousTwi.IsTimedOut;
                AlarmIfTimeout = previousTwi.AlarmIfTimeout;
                AlarmMethod = previousTwi.AlarmMethod;
            }

            /// <summary>
            /// Deregister the stat associated with this thread, if there is one.
            /// </summary>
            public void Cleanup()
            {
                // Copies made via the copy constructor never had a stat registered.
                if (Stat != null)
                    StatsManager.DeregisterStat(Stat);
            }
        }

        /// <summary>
        /// This event is called whenever a tracked thread is
        /// stopped or has not called UpdateThread() in time
        /// </summary>
        public static event Action<ThreadWatchdogInfo> OnWatchdogTimeout;

        public static JobEngine JobEngine { get; private set; }

        /// <summary>
        /// Is this watchdog active?
        /// </summary>
        public static bool Enabled
        {
            get { return m_enabled; }
            set
            {
                if (value == m_enabled)
                    return;

                m_enabled = value;

                if (m_enabled)
                {
                    // Set now so we don't get alerted on the first run
                    LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;
                }

                m_watchdogTimer.Enabled = m_enabled;
            }
        }

        private static bool m_enabled;
        private static readonly ILog m_log = LogManager.GetLogger(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType);

        // Tracked threads keyed by managed thread ID.  Also used as the lock object for add/remove/enumerate.
        private static Dictionary<int, ThreadWatchdogInfo> m_threads;
        private static System.Timers.Timer m_watchdogTimer;

        /// <summary>
        /// Last time the watchdog thread ran.
        /// </summary>
        /// <remarks>
        /// Should run every WATCHDOG_INTERVAL_MS
        /// </remarks>
        public static int LastWatchdogThreadTick { get; private set; }

        static Watchdog()
        {
            JobEngine = new JobEngine();
            m_threads = new Dictionary<int, ThreadWatchdogInfo>();
            m_watchdogTimer = new System.Timers.Timer(WATCHDOG_INTERVAL_MS);

            // One-shot timer: WatchdogTimerElapsed() restarts it once each pass has finished.
            m_watchdogTimer.AutoReset = false;
            m_watchdogTimer.Elapsed += WatchdogTimerElapsed;
        }

        /// <summary>
        /// Start a new thread that is tracked by the watchdog timer.
        /// </summary>
        /// <param name="start">The method that will be executed in a new thread</param>
        /// <param name="name">A name to give to the new thread</param>
        /// <param name="priority">Priority to run the thread at</param>
        /// <param name="isBackground">True to run this thread as a background thread, otherwise false</param>
        /// <param name="alarmIfTimeout">Trigger an alarm function if we have timed out</param>
        /// <param name="log">If true then creation of thread is logged.</param>
        /// <returns>The newly created Thread object</returns>
        public static Thread StartThread(
            ThreadStart start, string name, ThreadPriority priority, bool isBackground, bool alarmIfTimeout, bool log = true)
        {
            return StartThread(start, name, priority, isBackground, alarmIfTimeout, null, DEFAULT_WATCHDOG_TIMEOUT_MS, log);
        }

        /// <summary>
        /// Start a new thread that is tracked by the watchdog
        /// </summary>
        /// <param name="start">The method that will be executed in a new thread</param>
        /// <param name="name">A name to give to the new thread</param>
        /// <param name="priority">Priority to run the thread at</param>
        /// <param name="isBackground">True to run this thread as a background
        /// thread, otherwise false</param>
        /// <param name="alarmIfTimeout">Trigger an alarm function if we have timed out</param>
        /// <param name="alarmMethod">
        /// Alarm method to call if alarmIfTimeout is true and there is a timeout.
        /// Normally, this will just return some useful debugging information.
        /// </param>
        /// <param name="timeout">Number of milliseconds to wait until we issue a warning about timeout.</param>
        /// <param name="log">If true then creation of thread is logged.</param>
        /// <returns>The newly created Thread object</returns>
        public static Thread StartThread(
            ThreadStart start, string name, ThreadPriority priority, bool isBackground,
            bool alarmIfTimeout, Func<string> alarmMethod, int timeout, bool log = true)
        {
            Thread thread = new Thread(start);
            thread.Name = name;
            thread.Priority = priority;
            thread.IsBackground = isBackground;

            ThreadWatchdogInfo twi
                = new ThreadWatchdogInfo(thread, timeout)
                    { AlarmIfTimeout = alarmIfTimeout, AlarmMethod = alarmMethod };

            if (log)
                m_log.DebugFormat(
                    "[WATCHDOG]: Started tracking thread {0}, ID {1}", twi.Thread.Name, twi.Thread.ManagedThreadId);

            // Register before starting so the thread is already tracked when it first runs.
            lock (m_threads)
                m_threads.Add(twi.Thread.ManagedThreadId, twi);

            thread.Start();

            return thread;
        }

        /// <summary>
        /// Run the callback in a new thread immediately.  If the thread exits with an exception log it but do
        /// not propagate it.
        /// </summary>
        /// <remarks>
        /// The thread is removed from watchdog tracking when the callback finishes, whether or not it threw.
        /// </remarks>
        /// <param name="callback">Code for the thread to execute.</param>
        /// <param name="name">Name of the thread</param>
        /// <param name="obj">Object to pass to the thread.</param>
        /// <param name="log">If true then creation of thread is logged.</param>
        public static void RunInThread(WaitCallback callback, string name, object obj, bool log = false)
        {
            if (Util.FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
            {
                // Regression tests run the callback synchronously on the calling thread.
                Culture.SetCurrentCulture();
                callback(obj);
                return;
            }

            ThreadStart ts = new ThreadStart(delegate()
            {
                try
                {
                    Culture.SetCurrentCulture();
                    callback(obj);
                }
                catch (Exception e)
                {
                    m_log.Error(string.Format("[WATCHDOG]: Exception in thread {0}.", name), e);
                }
                finally
                {
                    // Always stop tracking, even if the callback threw, so that the entry and its
                    // registered stat are not left behind in m_threads.
                    try
                    {
                        Watchdog.RemoveThread(log:false);
                    }
                    catch (Exception e)
                    {
                        m_log.Error(string.Format("[WATCHDOG]: Exception in thread {0}.", name), e);
                    }
                }
            });

            StartThread(ts, name, ThreadPriority.Normal, true, false, log:log);
        }

        /// <summary>
        /// Marks the current thread as alive
        /// </summary>
        public static void UpdateThread()
        {
            UpdateThread(Thread.CurrentThread.ManagedThreadId);
        }

        /// <summary>
        /// Stops watchdog tracking on the current thread
        /// </summary>
        /// <param name="log">If false then normal events in thread removal are not logged.</param>
        /// <returns>
        /// True if the thread was removed from the list of tracked
        /// threads, otherwise false
        /// </returns>
        public static bool RemoveThread(bool log = true)
        {
            return RemoveThread(Thread.CurrentThread.ManagedThreadId, log);
        }

        /// <summary>
        /// Stops watchdog tracking of the thread with the given managed thread ID and deregisters its stat.
        /// </summary>
        /// <param name="threadID">Managed thread ID of the thread to stop tracking.</param>
        /// <param name="log">If true then a successful removal is logged.  A failed removal always warns.</param>
        /// <returns>True if the thread was being tracked and has been removed, otherwise false.</returns>
        private static bool RemoveThread(int threadID, bool log = true)
        {
            lock (m_threads)
            {
                ThreadWatchdogInfo twi;
                if (m_threads.TryGetValue(threadID, out twi))
                {
                    if (log)
                        m_log.DebugFormat(
                            "[WATCHDOG]: Removing thread {0}, ID {1}", twi.Thread.Name, twi.Thread.ManagedThreadId);

                    twi.Cleanup();
                    m_threads.Remove(threadID);

                    return true;
                }
                else
                {
                    m_log.WarnFormat(
                        "[WATCHDOG]: Requested to remove thread with ID {0} but this is not being monitored", threadID);

                    return false;
                }
            }
        }

        /// <summary>
        /// Aborts a tracked thread and stops tracking it.
        /// </summary>
        /// <param name="threadID">Managed thread ID of the thread to abort.</param>
        /// <returns>True if the thread was being tracked, otherwise false.</returns>
        public static bool AbortThread(int threadID)
        {
            lock (m_threads)
            {
                ThreadWatchdogInfo twi;
                if (!m_threads.TryGetValue(threadID, out twi))
                    return false;

                twi.Thread.Abort();
                RemoveThread(threadID);

                return true;
            }
        }

        /// <summary>
        /// Records a heartbeat for the given thread and clears its timed-out flag.
        /// </summary>
        /// <param name="threadID">Managed thread ID of the thread to mark as alive.</param>
        private static void UpdateThread(int threadID)
        {
            ThreadWatchdogInfo threadInfo;

            // Although TryGetValue is not a thread safe operation, we use a try/catch here instead
            // of a lock for speed. Adding/removing threads is a very rare operation compared to
            // UpdateThread(), and a single UpdateThread() failure here and there won't break
            // anything
            try
            {
                if (m_threads.TryGetValue(threadID, out threadInfo))
                {
                    threadInfo.LastTick = Environment.TickCount & Int32.MaxValue;
                    threadInfo.IsTimedOut = false;
                }
                else
                {
                    m_log.WarnFormat("[WATCHDOG]: Asked to update thread {0} which is not being monitored", threadID);
                }
            }
            catch { }
        }

        /// <summary>
        /// Get currently watched threads for diagnostic purposes
        /// </summary>
        /// <returns>An array holding the info of every thread tracked at the time of the call.</returns>
        public static ThreadWatchdogInfo[] GetThreadsInfo()
        {
            lock (m_threads)
                return m_threads.Values.ToArray();
        }

        /// <summary>
        /// Return the current thread's watchdog info.
        /// </summary>
        /// <returns>The watchdog info.  null if the thread isn't being monitored.</returns>
        public static ThreadWatchdogInfo GetCurrentThreadInfo()
        {
            ThreadWatchdogInfo twi;

            // TryGetValue leaves twi as null when the current thread is not tracked.
            lock (m_threads)
                m_threads.TryGetValue(Thread.CurrentThread.ManagedThreadId, out twi);

            return twi;
        }

        /// <summary>
        /// Check watched threads.  Fire alarm if appropriate.
        /// </summary>
        /// <remarks>
        /// Tracked threads are only examined when OnWatchdogTimeout has at least one subscriber.
        /// </remarks>
        /// <param name="sender"></param>
        /// <param name="e"></param>
        private static void WatchdogTimerElapsed(object sender, System.Timers.ElapsedEventArgs e)
        {
            int now = Environment.TickCount & Int32.MaxValue;
            int msElapsed = now - LastWatchdogThreadTick;

            if (msElapsed > WATCHDOG_INTERVAL_MS * 2)
                m_log.WarnFormat(
                    "[WATCHDOG]: {0} ms since Watchdog last ran.  Interval should be approximately {1} ms",
                    msElapsed, WATCHDOG_INTERVAL_MS);

            LastWatchdogThreadTick = Environment.TickCount & Int32.MaxValue;

            Action<ThreadWatchdogInfo> callback = OnWatchdogTimeout;

            if (callback != null)
            {
                List<ThreadWatchdogInfo> callbackInfos = null;

                lock (m_threads)
                {
                    // Enumerate a snapshot: RemoveThread() below removes entries from m_threads, and
                    // mutating the dictionary while enumerating it directly would invalidate the enumerator.
                    foreach (ThreadWatchdogInfo threadInfo in m_threads.Values.ToArray())
                    {
                        if (threadInfo.Thread.ThreadState == ThreadState.Stopped)
                        {
                            RemoveThread(threadInfo.Thread.ManagedThreadId);

                            if (callbackInfos == null)
                                callbackInfos = new List<ThreadWatchdogInfo>();

                            callbackInfos.Add(threadInfo);
                        }
                        else if (!threadInfo.IsTimedOut && now - threadInfo.LastTick >= threadInfo.Timeout)
                        {
                            threadInfo.IsTimedOut = true;

                            if (threadInfo.AlarmIfTimeout)
                            {
                                if (callbackInfos == null)
                                    callbackInfos = new List<ThreadWatchdogInfo>();

                                // Send a copy of the watchdog info to prevent race conditions where the watchdog
                                // thread updates the monitoring info after an alarm has been sent out.
                                callbackInfos.Add(new ThreadWatchdogInfo(threadInfo));
                            }
                        }
                    }
                }

                // Invoke subscribers outside the lock.
                if (callbackInfos != null)
                    foreach (ThreadWatchdogInfo callbackInfo in callbackInfos)
                        callback(callbackInfo);
            }

            if (MemoryWatchdog.Enabled)
                MemoryWatchdog.Update();

            ChecksManager.CheckChecks();
            StatsManager.RecordStats();

            // The timer is one-shot (AutoReset = false), so schedule the next pass.
            m_watchdogTimer.Start();
        }

        /// <summary>
        /// Run a job.
        /// </summary>
        /// <remarks>
        /// This differs from direct scheduling (e.g. Util.FireAndForget) in that a job can be run in the job
        /// engine if it is running, where all jobs are currently performed in sequence on a single thread.  This is
        /// to prevent observed overload and server freeze problems when there are hundreds of connections which all attempt to
        /// perform work at once (e.g. in conference situations).  With lower numbers of connections, the small
        /// delay in performing jobs in sequence rather than concurrently has not been noticeable in testing, though a future more
        /// sophisticated implementation could perform jobs concurrently when the server is under low load.
        ///
        /// However, be advised that some callers of this function rely on all jobs being performed in sequence if any
        /// jobs are performed in sequence (i.e. if jobengine is active or not).  Therefore, expanding the jobengine
        /// beyond a single thread will require considerable thought.
        ///
        /// Also, any jobs submitted must be guaranteed to complete within a reasonable timeframe (e.g. they cannot
        /// incorporate a network delay with a long timeout).  At the moment, work that could suffer such issues
        /// should still be run directly with RunInThread(), Util.FireAndForget(), etc.  This is another area where
        /// the job engine could be improved and so CPU utilization improved by better management of concurrency within
        /// OpenSimulator.
        /// </remarks>
        /// <param name="jobType">General classification for the job (e.g. "RezAttachments").</param>
        /// <param name="callback">Callback for job.</param>
        /// <param name="name">Specific name of job (e.g. "RezAttachments for Joe Bloggs"</param>
        /// <param name="obj">Object to pass to callback when run</param>
        /// <param name="canRunInThisThread">If set to true then the job may be run in this calling thread.</param>
        /// <param name="mustNotTimeout">If true then the job must never timeout.</param>
        /// <param name="log">If set to true then extra logging is performed.</param>
        public static void RunJob(
            string jobType, WaitCallback callback, string name, object obj,
            bool canRunInThisThread = false, bool mustNotTimeout = false,
            bool log = false)
        {
            if (Util.FireAndForgetMethod == FireAndForgetMethod.RegressionTest)
            {
                // Regression tests run the callback synchronously on the calling thread.
                Culture.SetCurrentCulture();
                callback(obj);
                return;
            }

            if (JobEngine.IsRunning)
                JobEngine.QueueRequest(name, callback, obj);
            else if (canRunInThisThread)
                callback(obj);
            else if (mustNotTimeout)
                RunInThread(callback, name, obj, log);
            else
                Util.FireAndForget(callback, obj, name);
        }
    }
}