From b13214af273f826699e97bd1f4dd2e49bfe43f5e Mon Sep 17 00:00:00 2001 From: Oren Hurvitz Date: Tue, 14 Jan 2014 09:55:19 +0200 Subject: [PATCH] Added a watchdog to abort slow threads in the main thread pool If a task in the thread pool hangs then it will permanently take up one of the threads. If this happens repeatedly then the thread pool will become exhausted and OpenSim will fail. This watchdog terminates threads in the thread pool that have been executing for over a minute (which probably means they're hung). Resolves http://opensimulator.org/mantis/view.php?id=6945 --- OpenSim/Framework/Util.cs | 282 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 270 insertions(+), 12 deletions(-) diff --git a/OpenSim/Framework/Util.cs b/OpenSim/Framework/Util.cs index 6be172246c..2de596e8f5 100644 --- a/OpenSim/Framework/Util.cs +++ b/OpenSim/Framework/Util.cs @@ -51,6 +51,7 @@ using Nwc.XmlRpc; using OpenMetaverse; using OpenMetaverse.StructuredData; using Amib.Threading; +using System.Collections.Concurrent; namespace OpenSim.Framework { @@ -148,6 +149,9 @@ namespace OpenSim.Framework /// private static SmartThreadPool m_ThreadPool; + // Watchdog timer that aborts threads that have timed-out + private static Timer m_threadPoolWatchdog; + // Unix-epoch starts at January 1st 1970, 00:00:00 UTC. And all our times in the server are (or at least should be) in UTC. public static readonly DateTime UnixEpoch = DateTime.ParseExact("1970-01-01 00:00:00 +0", "yyyy-MM-dd hh:mm:ss z", DateTimeFormatInfo.InvariantInfo).ToUniversalTime(); @@ -1883,6 +1887,7 @@ namespace OpenSim.Framework startInfo.MinWorkerThreads = minThreads; m_ThreadPool = new SmartThreadPool(startInfo); + m_threadPoolWatchdog = new Timer(ThreadPoolWatchdog, null, 0, 1000); } public static int FireAndForgetCount() @@ -1907,16 +1912,117 @@ namespace OpenSim.Framework } + /// + /// Additional information about threads in the main thread pool. Used to time how long the + /// thread has been running, and abort it if it has timed-out. + /// + private class ThreadInfo + { + public long ThreadFuncNum { get; set; } + public string StackTrace { get; set; } + private string context; + + public IWorkItemResult WorkItem { get; set; } + public Thread Thread { get; set; } + public bool Running { get; set; } + public bool Aborted { get; set; } + private int started; + + public ThreadInfo(long threadFuncNum, string context) + { + ThreadFuncNum = threadFuncNum; + this.context = context; + Thread = null; + Running = false; + Aborted = false; + } + + public void Started() + { + Thread = Thread.CurrentThread; + started = EnvironmentTickCount(); + Running = true; + } + + public void Ended() + { + Running = false; + } + + public int Elapsed() + { + return EnvironmentTickCountSubtract(started); + } + + public void Abort() + { + Aborted = true; + WorkItem.Cancel(true); + } + + /// + /// Returns the thread's stack trace. + /// + /// + /// May return one of two stack traces. First, tries to get the thread's active stack + /// trace. But this can fail, so as a fallback this method will return the stack + /// trace that was active when the task was queued. + /// + public string GetStackTrace() + { + string ret = (context == null) ? "" : ("(" + context + ") "); + + StackTrace activeStackTrace = Util.GetStackTrace(Thread); + if (activeStackTrace != null) + ret += activeStackTrace.ToString(); + else if (StackTrace != null) + ret += "(Stack trace when queued) " + StackTrace; + // else, no stack trace available + + return ret; + } + } + + private static long nextThreadFuncNum = 0; private static long numQueuedThreadFuncs = 0; private static long numRunningThreadFuncs = 0; private static Int32 threadFuncOverloadMode = 0; + // Maps (ThreadFunc number -> Thread) + private static ConcurrentDictionary activeThreads = new ConcurrentDictionary(); + + private static readonly int THREAD_TIMEOUT = 60 * 1000; + + /// + /// Finds threads in the main thread pool that have timed-out, and aborts them. + /// + private static void ThreadPoolWatchdog(object state) + { + foreach (KeyValuePair entry in activeThreads) + { + ThreadInfo t = entry.Value; + if (t.Running && !t.Aborted && (t.Elapsed() >= THREAD_TIMEOUT)) + { + m_log.WarnFormat("Timeout in threadfunc {0} ({1}) {2}", t.ThreadFuncNum, t.Thread.Name, t.GetStackTrace()); + t.Abort(); + + ThreadInfo dummy; + activeThreads.TryRemove(entry.Key, out dummy); + + // It's possible that the thread won't abort. To make sure the thread pool isn't + // depleted, increase the pool size. + m_ThreadPool.MaxThreads++; + } + } + } + + public static void FireAndForget(System.Threading.WaitCallback callback, object obj) { FireAndForget(callback, obj, null); } - + public static void FireAndForget(System.Threading.WaitCallback callback, object obj, string context) { WaitCallback realCallback; @@ -1924,6 +2030,7 @@ namespace OpenSim.Framework bool loggingEnabled = (DebugFlags & DebugFlagsEnum.LogThreadPool) != 0; long threadFuncNum = Interlocked.Increment(ref nextThreadFuncNum); + ThreadInfo threadInfo = new ThreadInfo(threadFuncNum, context); if (FireAndForgetMethod == FireAndForgetMethod.RegressionTest) { @@ -1939,6 +2046,8 @@ namespace OpenSim.Framework { long numQueued1 = Interlocked.Decrement(ref numQueuedThreadFuncs); long numRunning1 = Interlocked.Increment(ref numRunningThreadFuncs); + threadInfo.Started(); + activeThreads[threadFuncNum] = threadInfo; try { @@ -1949,15 +2058,22 @@ namespace OpenSim.Framework callback(o); } + catch (ThreadAbortException e) + { + m_log.Error(string.Format("Aborted threadfunc {0} ", threadFuncNum), e); + } catch (Exception e) { - m_log.Error("[UTIL]: FireAndForget thread terminated with error ", e); + m_log.Error(string.Format("[UTIL]: Util STP threadfunc {0} terminated with error ", threadFuncNum), e); } finally { Interlocked.Decrement(ref numRunningThreadFuncs); + threadInfo.Ended(); + ThreadInfo dummy; + activeThreads.TryRemove(threadFuncNum, out dummy); if (loggingEnabled || (threadFuncOverloadMode == 1)) - m_log.Debug("Exit threadfunc " + threadFuncNum); + m_log.DebugFormat("Exit threadfunc {0} ({1})", threadFuncNum, FormatDuration(threadInfo.Elapsed())); } }; } @@ -1979,10 +2095,13 @@ namespace OpenSim.Framework if (loggingEnabled || (threadFuncOverloadMode == 1)) { + string full, partial; + GetFireAndForgetStackTrace(out full, out partial); m_log.DebugFormat("Queue threadfunc {0} (Queued {1}, Running {2}) {3}{4}", threadFuncNum, numQueued, numRunningThreadFuncs, (context == null) ? "" : ("(" + context + ") "), - GetFireAndForgetStackTrace(loggingEnabled)); + loggingEnabled ? full : partial); + threadInfo.StackTrace = full; } switch (FireAndForgetMethod) @@ -2004,7 +2123,7 @@ namespace OpenSim.Framework case FireAndForgetMethod.SmartThreadPool: if (m_ThreadPool == null) InitThreadPool(2, 15); - m_ThreadPool.QueueWorkItem((cb, o) => cb(o), realCallback, obj); + threadInfo.WorkItem = m_ThreadPool.QueueWorkItem((cb, o) => cb(o), realCallback, obj); break; case FireAndForgetMethod.Thread: Thread thread = new Thread(delegate(object o) { realCallback(o); }); @@ -2017,6 +2136,8 @@ namespace OpenSim.Framework catch (Exception) { Interlocked.Decrement(ref numQueuedThreadFuncs); + ThreadInfo dummy; + activeThreads.TryRemove(threadFuncNum, out dummy); throw; } } @@ -2024,8 +2145,9 @@ namespace OpenSim.Framework /// /// Returns a stack trace for a thread added using FireAndForget(). /// - /// True: return full stack trace; False: return only the first frame - private static string GetFireAndForgetStackTrace(bool full) + /// Will contain the full stack trace + /// Will contain only the first frame of the stack trace + private static void GetFireAndForgetStackTrace(out string full, out string partial) { string src = Environment.StackTrace; string[] lines = src.Split(new string[] { Environment.NewLine }, StringSplitOptions.None); @@ -2034,6 +2156,7 @@ namespace OpenSim.Framework bool started = false; bool first = true; + partial = ""; for (int i = 0; i < lines.Length; i++) { @@ -2051,21 +2174,102 @@ namespace OpenSim.Framework { line = line.TrimStart(); first = false; + partial = line; } - bool last = (i == lines.Length - 1) || !full; + bool last = (i == lines.Length - 1); if (last) dest.Append(line); else dest.AppendLine(line); - - if (!full) - break; } - return dest.ToString(); + full = dest.ToString(); } +#pragma warning disable 0618 + /// + /// Return the stack trace of a different thread. + /// + /// + /// This is complicated because the thread needs to be paused in order to get its stack + /// trace. And pausing another thread can cause a deadlock. This method attempts to + /// avoid deadlock by using a short timeout (200ms), after which it gives up and + /// returns 'null' instead of the stack trace. + /// + /// Take from: http://stackoverflow.com/a/14935378 + /// + /// WARNING: this doesn't work in Mono. See https://bugzilla.novell.com/show_bug.cgi?id=571691 + /// + /// + /// The stack trace, or null if failed to get it + private static StackTrace GetStackTrace(Thread targetThread) + { + if (IsPlatformMono) + { + // This doesn't work in Mono + return null; + } + + ManualResetEventSlim fallbackThreadReady = new ManualResetEventSlim(); + ManualResetEventSlim exitedSafely = new ManualResetEventSlim(); + + try + { + new Thread(delegate() + { + fallbackThreadReady.Set(); + while (!exitedSafely.Wait(200)) + { + try + { + targetThread.Resume(); + } + catch (Exception) + { + // Whatever happens, do never stop to resume the main-thread regularly until the main-thread has exited safely. + } + } + }).Start(); + + fallbackThreadReady.Wait(); + // From here, you have about 200ms to get the stack-trace + + targetThread.Suspend(); + + StackTrace trace = null; + try + { + trace = new StackTrace(targetThread, true); + } + catch (ThreadStateException) + { + //failed to get stack trace, since the fallback-thread resumed the thread + //possible reasons: + //1.) This thread was just too slow + //2.) A deadlock ocurred + //Automatic retry seems too risky here, so just return null. + } + + try + { + targetThread.Resume(); + } + catch (ThreadStateException) + { + // Thread is running again already + } + + return trace; + } + finally + { + // Signal the fallack-thread to stop + exitedSafely.Set(); + } + } +#pragma warning restore 0618 + /// /// Get information about the current state of the smart thread pool. /// @@ -2149,6 +2353,60 @@ namespace OpenSim.Framework return tcA - tcB; } + /// + /// Formats a duration (given in milliseconds). + /// + public static string FormatDuration(int ms) + { + TimeSpan span = new TimeSpan(ms * TimeSpan.TicksPerMillisecond); + + string str = ""; + string suffix = null; + + int hours = (int)span.TotalHours; + if (hours > 0) + { + str += hours.ToString(str.Length == 0 ? "0" : "00"); + suffix = "hours"; + } + + if ((hours > 0) || (span.Minutes > 0)) + { + if (str.Length > 0) + str += ":"; + str += span.Minutes.ToString(str.Length == 0 ? "0" : "00"); + if (suffix == null) + suffix = "min"; + } + + if ((hours > 0) || (span.Minutes > 0) || (span.Seconds > 0)) + { + if (str.Length > 0) + str += ":"; + str += span.Seconds.ToString(str.Length == 0 ? "0" : "00"); + if (suffix == null) + suffix = "sec"; + } + + if (suffix == null) + suffix = "ms"; + + if (span.TotalMinutes < 1) + { + int ms1 = span.Milliseconds; + if (str.Length > 0) + { + ms1 /= 100; + str += "."; + } + str += ms1.ToString("0"); + } + + str += " " + suffix; + + return str; + } + /// /// Prints the call stack at any given point. Useful for debugging. ///