PDA

View Full Version : [Not Resolved] LSWS 4.1 - Spinlock


MikeDVB
04-23-2011, 01:00 AM
On LSWS 4.1, LiteSpeed is spinlocking and ceasing to serve requests. I ran an strace of the process while this was happening, but didn't manage to force a core dump before the problem somehow resolved itself.

Here's some of the strace (it's pages long of the same thing):
epoll_wait(7, {{EPOLLIN, {u32=0, u64=532575944704}}, {EPOLLIN, {u32=0, u64=627065225216}}, {EPOLLIN, {u32=0, u64=382252089344}}, {EPOLLIN, {u32=0, u64=1533303324672}}, {EPOLLIN, {u32=0, u64=399431958528}}, {EPOLLIN, {u32=0, u64=528280977408}}, {EPOLLIN, {u32=0, u64=365072220160}}, {EPOLLIN, {u32=0, u64=176093659136}}, {EPOLLIN, {u32=0, u64=691489734656}}, {EPOLLIN, {u32=0, u64=708669603840}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=81604378624}}, {EPOLLIN, {u32=0, u64=373662154752}}, {EPOLLIN, {u32=0, u64=571230650368}}, {EPOLLIN, {u32=0, u64=47244640256}}, {EPOLLIN, {u32=0, u64=201863462912}}, {EPOLLIN, {u32=0, u64=1490353651712}}, {EPOLLIN, {u32=0, u64=141733920768}}, {EPOLLIN, {u32=0, u64=657129996288}}, {EPOLLIN, {u32=0, u64=111669149696}}, {EPOLLIN, {u32=0, u64=287762808832}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=193273528320}}, {EPOLLIN, {u32=0, u64=545460846592}}, {EPOLLIN, {u32=0, u64=536870912000}}, {EPOLLIN, {u32=0, u64=107374182400}}, {EPOLLIN, {u32=0, u64=588410519552}}, {EPOLLIN, {u32=0, u64=639950127104}}, {EPOLLIN, {u32=0, u64=1267015352320}}, {EPOLLIN, {u32=0, u64=584115552256}}, {EPOLLIN, {u32=0, u64=579820584960}}, {EPOLLIN, {u32=0, u64=743029342208}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=554050781184}}, {EPOLLIN, {u32=0, u64=459561500672}}, {EPOLLIN, {u32=0, u64=1249835483136}}, {EPOLLIN, {u32=0, u64=115964116992}}, {EPOLLIN, {u32=0, u64=614180323328}}, {EPOLLIN, {u32=0, u64=296352743424}}, {EPOLLIN, {u32=0, u64=210453397504}}, {EPOLLIN, {u32=0, u64=1597727834112}}, {EPOLLIN, {u32=0, u64=236223201280}}, {EPOLLOUT, {u32=0, u64=9590661971968}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=700079669248}}, {EPOLLIN, {u32=0, u64=506806140928}}, {EPOLLIN, {u32=0, u64=442381631488}}, {EPOLLIN, {u32=0, u64=682899800064}}, {EPOLLIN, {u32=0, u64=674309865472}}, {EPOLLIN, {u32=0, u64=425201762304}}, {EPOLLIN, {u32=0, u64=330712481792}}, {EPOLLIN, {u32=0, u64=562640715776}}, {EPOLLIN, {u32=0, u64=98784247808}}, {EPOLLIN, {u32=0, u64=631360192512}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=691489734656}}, {EPOLLIN, {u32=0, u64=176093659136}}, {EPOLLIN, {u32=0, u64=365072220160}}, {EPOLLIN, {u32=0, u64=528280977408}}, {EPOLLIN, {u32=0, u64=399431958528}}, {EPOLLIN, {u32=0, u64=1533303324672}}, {EPOLLIN, {u32=0, u64=382252089344}}, {EPOLLIN, {u32=0, u64=627065225216}}, {EPOLLIN, {u32=0, u64=532575944704}}, {EPOLLIN, {u32=0, u64=279172874240}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=111669149696}}, {EPOLLIN, {u32=0, u64=657129996288}}, {EPOLLIN, {u32=0, u64=141733920768}}, {EPOLLIN, {u32=0, u64=1490353651712}}, {EPOLLIN, {u32=0, u64=201863462912}}, {EPOLLIN, {u32=0, u64=47244640256}}, {EPOLLIN, {u32=0, u64=571230650368}}, {EPOLLIN, {u32=0, u64=373662154752}}, {EPOLLIN, {u32=0, u64=81604378624}}, {EPOLLIN, {u32=0, u64=708669603840}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=579820584960}}, {EPOLLIN, {u32=0, u64=584115552256}}, {EPOLLIN, {u32=0, u64=1267015352320}}, {EPOLLIN, {u32=0, u64=639950127104}}, {EPOLLIN, {u32=0, u64=588410519552}}, {EPOLLIN, {u32=0, u64=107374182400}}, {EPOLLIN, {u32=0, u64=536870912000}}, {EPOLLIN, {u32=0, u64=545460846592}}, {EPOLLIN, {u32=0, u64=193273528320}}, {EPOLLIN, {u32=0, u64=287762808832}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=236223201280}}, {EPOLLIN, {u32=0, u64=1597727834112}}, {EPOLLIN, {u32=0, u64=210453397504}}, {EPOLLIN, {u32=0, u64=296352743424}}, {EPOLLIN, {u32=0, u64=614180323328}}, {EPOLLIN, {u32=0, u64=115964116992}}, {EPOLLIN, {u32=0, u64=1249835483136}}, {EPOLLIN, {u32=0, u64=459561500672}}, {EPOLLIN, {u32=0, u64=554050781184}}, {EPOLLIN, {u32=0, u64=743029342208}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=98784247808}}, {EPOLLIN, {u32=0, u64=562640715776}}, {EPOLLIN, {u32=0, u64=330712481792}}, {EPOLLIN, {u32=0, u64=425201762304}}, {EPOLLIN, {u32=0, u64=674309865472}}, {EPOLLIN, {u32=0, u64=682899800064}}, {EPOLLIN, {u32=0, u64=442381631488}}, {EPOLLIN, {u32=0, u64=506806140928}}, {EPOLLIN, {u32=0, u64=700079669248}}, {EPOLLOUT, {u32=0, u64=9590661971968}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=532575944704}}, {EPOLLIN, {u32=0, u64=627065225216}}, {EPOLLIN, {u32=0, u64=382252089344}}, {EPOLLIN, {u32=0, u64=1533303324672}}, {EPOLLIN, {u32=0, u64=399431958528}}, {EPOLLIN, {u32=0, u64=528280977408}}, {EPOLLIN, {u32=0, u64=365072220160}}, {EPOLLIN, {u32=0, u64=176093659136}}, {EPOLLIN, {u32=0, u64=691489734656}}, {EPOLLIN, {u32=0, u64=631360192512}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=81604378624}}, {EPOLLIN, {u32=0, u64=373662154752}}, {EPOLLIN, {u32=0, u64=571230650368}}, {EPOLLIN, {u32=0, u64=47244640256}}, {EPOLLIN, {u32=0, u64=201863462912}}, {EPOLLIN, {u32=0, u64=1490353651712}}, {EPOLLIN, {u32=0, u64=141733920768}}, {EPOLLIN, {u32=0, u64=657129996288}}, {EPOLLIN, {u32=0, u64=111669149696}}, {EPOLLIN, {u32=0, u64=279172874240}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=193273528320}}, {EPOLLIN, {u32=0, u64=545460846592}}, {EPOLLIN, {u32=0, u64=536870912000}}, {EPOLLIN, {u32=0, u64=107374182400}}, {EPOLLIN, {u32=0, u64=588410519552}}, {EPOLLIN, {u32=0, u64=639950127104}}, {EPOLLIN, {u32=0, u64=1267015352320}}, {EPOLLIN, {u32=0, u64=584115552256}}, {EPOLLIN, {u32=0, u64=579820584960}}, {EPOLLIN, {u32=0, u64=708669603840}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=554050781184}}, {EPOLLIN, {u32=0, u64=459561500672}}, {EPOLLIN, {u32=0, u64=1249835483136}}, {EPOLLIN, {u32=0, u64=115964116992}}, {EPOLLIN, {u32=0, u64=614180323328}}, {EPOLLIN, {u32=0, u64=296352743424}}, {EPOLLIN, {u32=0, u64=210453397504}}, {EPOLLIN, {u32=0, u64=1597727834112}}, {EPOLLIN, {u32=0, u64=236223201280}}, {EPOLLIN, {u32=0, u64=287762808832}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=700079669248}}, {EPOLLIN, {u32=0, u64=506806140928}}, {EPOLLIN, {u32=0, u64=442381631488}}, {EPOLLIN, {u32=0, u64=682899800064}}, {EPOLLIN, {u32=0, u64=674309865472}}, {EPOLLIN, {u32=0, u64=425201762304}}, {EPOLLIN, {u32=0, u64=330712481792}}, {EPOLLIN, {u32=0, u64=562640715776}}, {EPOLLIN, {u32=0, u64=98784247808}}, {EPOLLIN, {u32=0, u64=743029342208}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=691489734656}}, {EPOLLIN, {u32=0, u64=176093659136}}, {EPOLLIN, {u32=0, u64=365072220160}}, {EPOLLIN, {u32=0, u64=528280977408}}, {EPOLLIN, {u32=0, u64=399431958528}}, {EPOLLIN, {u32=0, u64=1533303324672}}, {EPOLLIN, {u32=0, u64=382252089344}}, {EPOLLIN, {u32=0, u64=627065225216}}, {EPOLLIN, {u32=0, u64=532575944704}}, {EPOLLOUT, {u32=0, u64=9590661971968}}}, 10, 100) = 10
epoll_wait(7, {{EPOLLIN, {u32=0, u64=111669149696}}, {EPOLLIN, {u32=0, u64=657129996288}}, {EPOLLIN, {u32=0, u64=141733920768}}, {EPOLLIN, {u32=0, u64=1490353651712}}, {EPOLLIN, {u32=0, u64=201863462912}}, {EPOLLIN, {u32=0, u64=47244640256}}, {EPOLLIN, {u32=0, u64=571230650368}}, {EPOLLIN, {u32=0, u64=373662154752}}, {EPOLLIN, {u32=0, u64=81604378624}}, {EPOLLIN, {u32=0, u64=631360192512}}}, 10, 100) = 10

It's done this three times tonight, and I only had the presence of mind to strace the process the last time around, thanks to Tony at HawkHost. I fully expect it to happen again, at which point I'm going to force a core dump and get it to bug@.

I had this happen on another server yesterday as well, but it only happened once and hasn't happened since on that server, and the hardware/software is identical.

MikeDVB
04-23-2011, 01:09 AM
It happened again, and I forced a signal 11 core dump and have gotten it to bug@.

Looks to be an issue with epoll?

Reading symbols from /usr/local/lsws/bin/lshttpd...(no debugging symbols found)...done.

warning: core file may not match specified executable file.
[New Thread 7014]
[New Thread 7013]
Reading symbols from /lib64/libpthread.so.0...(no debugging symbols found)...done.
Loaded symbols for /lib64/libpthread.so.0
Reading symbols from /lib64/libm.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libm.so.6
Reading symbols from /lib64/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib64/libc.so.6
Reading symbols from /lib64/libcrypt.so.1...(no debugging symbols found)...done.
Loaded symbols for /lib64/libcrypt.so.1
Reading symbols from /lib64/libdl.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libdl.so.2
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Reading symbols from /lib64/libnss_files.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/libnss_files.so.2
Core was generated by `litespeed'.
Program terminated with signal 11, Segmentation fault.
#0 0x0000003e820d0f69 in syscall () from /lib64/libc.so.6
(gdb) bt
#0 0x0000003e820d0f69 in syscall () from /lib64/libc.so.6
#1 0x00000000004f679d in epoll::waitAndProcessEvents (this=0xa27cf10, iTimeoutMilliSec=100) at /httpd/edio/epoll.cpp:229
#2 0x000000000044f280 in EventDispatcher::run (this=0xa23e218) at /httpd/http/eventdispatcher.cpp:225
#3 0x000000000040fca5 in HttpServerImpl::start (this=0xa23e1f0) at /httpd/main/httpserver.cpp:475
#4 0x0000000000412cab in HttpServer::start (this=0x8772d0) at /httpd/main/httpserver.cpp:1849
#5 0x0000000000409eee in LshttpdMain::main (this=0xa23de30, argc=1, argv=0x7fff43247638) at /httpd/main/lshttpdmain.cpp:1761
#6 0x0000000000405a3f in main (argc=1, argv=0x7fff43247638) at /httpd/main.cpp:121

I've had to go to 4.1RC4 as this keeps happening on the release/debug version of 4.1.

MikeDVB
04-23-2011, 01:59 AM
I've been forced to roll back to 4.0.20 as 4.1RC4 and 4.1 are exhibiting this behavior on two separate systems.

MikeDVB
04-24-2011, 10:01 PM
George has gotten us a new debug build of the software as he believes he's resolved this issue. Apparently it's due to the incorrect accounting of concurrent connections for a vhost but we'll see as it shouldn't take long to find out if this fixed the issue or not.

MikeDVB
04-26-2011, 12:56 AM
Enough time has passed since George got us the new build (which fixed the concurrent connection bug that was causing our spinlock) that I feel this can be marked resolved.

MikeDVB
04-28-2011, 12:43 PM
This isn't resolved, it just happened on another server.

Strace:
epoll_wait(8, {{EPOLLIN, {u32=0, u64=1082331758592}}, {EPOLLIN, {u32=0, u64=1099511627776}}, {EPOLLIN, {u32=0, u64=601295421440}}, {EPOLLIN, {u32=0, u64=146028888064}}, {EPOLLIN, {u32=0, u64=515396075520}}, {EPOLLIN, {u32=0, u64=455266533376}}, {EPOLLIN, {u32=0, u64=635655159808}}, {EPOLLIN, {u32=0, u64=566935683072}}, {EPOLLIN, {u32=0, u64=721554505728}}, {EPOLLIN, {u32=0, u64=910533066752}}}, 10, 100) = 10
epoll_wait(8, {{EPOLLIN, {u32=0, u64=68719476736}}, {EPOLLIN, {u32=0, u64=60129542144}}, {EPOLLIN, {u32=0, u64=51539607552}}, {EPOLLIN, {u32=0, u64=687194767360}}, {EPOLLIN, {u32=0, u64=987842478080}}, {EPOLLIN, {u32=0, u64=996432412672}}, {EPOLLIN, {u32=0, u64=678604832768}}, {EPOLLIN, {u32=0, u64=1120986464256}}, {EPOLLIN, {u32=0, u64=919123001344}}, {EPOLLIN, {u32=0, u64=695784701952}}}, 10, 100) = 10
epoll_wait(8, {{EPOLLIN, {u32=0, u64=644245094400}}, {EPOLLIN, {u32=0, u64=618475290624}}, {EPOLLIN, {u32=0, u64=609885356032}}, {EPOLLIN, {u32=0, u64=798863917056}}, {EPOLLIN, {u32=0, u64=532575944704}}, {EPOLLIN, {u32=0, u64=970662608896}}, {EPOLLIN, {u32=0, u64=773094113280}}, {EPOLLIN, {u32=0, u64=893353197568}}, {EPOLLIN, {u32=0, u64=292057776128}}, {EPOLLIN, {u32=0, u64=979252543488}}}, 10, 100) = 10

Backtrace:
[New Thread 23136]
[New Thread 23135]
Core was generated by `litespeed'.
Program terminated with signal 11, Segmentation fault.
#0 0x00000038ede0b150 in pthread_cond_timedwait@@GLIBC_2.3.2 ()
from /lib64/libpthread.so.0
#0 0x00000038ede0b150 in pthread_cond_timedwait@@GLIBC_2.3.2 ()
from /lib64/libpthread.so.0
#1 0x00000000004f77f7 in PThreadCond::wait (this=0x571fcb8,
pMutex=0x571fc90, lMilliSec=1000)
at /home/gwang/release/httpd/httpd/thread/pthread/pthreadcond.cpp:33
#2 0x000000000041665b in PThreadWorkQueue<BlockOpReq>::get (this=0x571fc90,
pWork=0x42ec3db0, size=@0x42ec3dac, lMilliSec=1000)
at /home/gwang/release/httpd/httpd/thread/pthread/pthreadworkqueue.h:147
#3 0x0000000000416563 in TWorkQueue<BlockOpReq, PThreadWorkQueue<BlockOpReq> >::get (this=0x571fc90, pWork=0x42ec3db0, size=@0x42ec3dac, lMilliSec=1000)
at /home/gwang/release/httpd/httpd/thread/workqueue.h:37
#4 0x00000000004163da in WorkCrew<BlockOpReq, BlockOpDoWork, WorkQueue<BlockOpReq> >::TCrewMember<BlockOpReq, BlockOpDoWork, WorkQueue<BlockOpReq> >::run (
this=0x5709030) at /home/gwang/release/httpd/httpd/thread/workcrew.h:307
#5 0x000000000041631c in PThreadImpl<WorkCrew<BlockOpReq, BlockOpDoWork, WorkQueue<BlockOpReq> >::TCrewMember<BlockOpReq, BlockOpDoWork, WorkQueue<BlockOpReq> > >::threadRoutine (arg=0x5709030)
at /home/gwang/release/httpd/httpd/thread/pthread/pthreadimpl.h:98
#6 0x00000038ede0673d in start_thread () from /lib64/libpthread.so.0
#7 0x00000038ed6d44bd in clone () from /lib64/libc.so.6

MikeDVB
04-28-2011, 06:14 PM
I am wondering what the issue is. I didn't manage to get in and run a netstat fast enough, but judging by how gradually the curve rises and how quickly it drops off, it looks more like an internal counting issue than an actual site being under attack.

The next time it happens (I am watching manually for hours) I will do a netstat to see if I can verify the traffic.

http://www.screen-shot.net/2011-04-28_1918-1.png
http://www.screen-shot.net/2011-04-28_1920.png

During that same timeframe, there was also no increase in packet flow or traffic on the switch port for the server, which leads me to believe it's an LSWS issue.
The monitoring server is an hour behind the web server, so 1918 on the web server is 1818 on the monitoring server:
http://www.screen-shot.net/2011-04-28_1931.png
http://www.screen-shot.net/2011-04-28_1932.png

JLHC
05-08-2011, 06:40 AM
Any update to this, Mike?

MikeDVB
05-08-2011, 12:17 PM
Whatever the issue is that is causing it to spin up to 10,000 "concurrent connections" (when no actual connections above 150 exist) still exists and I haven't had time just yet to do what needs to be done to diagnose the cause on our end. We simply, for the time being, set the connection limit to 50k so that when it does have the issue it doesn't actually fail everybody out due to being "out of connections".

andreas
05-11-2011, 07:20 AM
Same problem here, today for the first time:

gettimeofday({1305123558, 321218}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321264}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321311}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321357}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321403}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321448}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321495}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321541}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321587}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321634}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321680}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321725}, NULL) = 0
epoll_wait(6, {{EPOLLIN, {u32=10, u64=10}}, {EPOLLIN, {u32=11, u64=11}}}, 10, 100) = 2
gettimeofday({1305123558, 321772}, NULL) = 0

andreas
05-11-2011, 07:38 AM
Keeps happening constantly now... back to 4.0.20.

MikeDVB
05-11-2011, 03:27 PM
Indeed, the only way I've found to prevent the spinlock is to set the maximum connections (http) to something like 50k and then the spinlock never happens. On 4.0 you could set it to something conservative like 2k (the default) and you would be fine but if you upgrade to 4.1 and leave it at 2k you'll inevitably hit this spinlock eventually.

Pipert
05-12-2011, 12:01 AM
Indeed, the only way I've found to prevent the spinlock is to set the maximum connections (http) to something like 50k and then the spinlock never happens. On 4.0 you could set it to something conservative like 2k (the default) and you would be fine but if you upgrade to 4.1 and leave it at 2k you'll inevitably hit this spinlock eventually.

I've had Litespeed 4.1 randomly crash on me a couple of times. I've now downgraded to 4.0.20 and I haven't had any issues with it.

MikeDVB
05-12-2011, 12:17 AM
I've had Litespeed 4.1 randomly crash on me a couple of times. I've now downgraded to 4.0.20 and I haven't had any issues with it.

When it crashed, did you have to manually intervene? The issue with this spin lock is that it doesn't kill itself off and start a new process automatically. It will sit and do nothing but spin lock for 5 to 15 minutes before it gets a signal 11.

MikeDVB
05-12-2011, 11:20 AM
On the latest debug version I'm getting a lot of this:
root@echo [/tmp/lshttpd/bak_core]# gdb /usr/local/lsws/bin/lshttpd core.829851
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-32.el5_6.2)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /usr/local/lsws/bin/lshttpd...(no debugging symbols found)...done.

warning: core file may not match specified executable file.
[New Thread 829853]
[New Thread 829852]
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Failed to read a valid object file image from memory.
Core was generated by `litespeed'.
Program terminated with signal 11, Segmentation fault.
#0 0x00000036f867c2e3 in ?? ()
(gdb) bt
Cannot access memory at address 0x7fffaeabaf98

The one from yesterday is the same:

root@echo [/tmp/lshttpd/bak_core]# gdb /usr/local/lsws/bin/lshttpd core.624673
GNU gdb (GDB) Red Hat Enterprise Linux (7.0.1-32.el5_6.2)
Copyright (C) 2009 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>...
Reading symbols from /usr/local/lsws/bin/lshttpd...(no debugging symbols found)...done.

warning: core file may not match specified executable file.
[New Thread 624678]
[New Thread 624677]
Reading symbols from /lib64/ld-linux-x86-64.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib64/ld-linux-x86-64.so.2
Failed to read a valid object file image from memory.
Core was generated by `litespeed'.
Program terminated with signal 11, Segmentation fault.
#0 0x00000036f867c321 in ?? ()
(gdb) bt
Cannot access memory at address 0x7fff48bf21b8

JLHC
05-20-2011, 10:22 PM
Does version 4.1.1 resolve this issue?

MikeDVB
05-20-2011, 11:17 PM
Does version 4.1.1 resolve this issue?

I'm not sure, waiting on George to chime in (either here, or via email) and let me know whether or not this issue has been resolved.

JLHC
05-20-2011, 11:32 PM
I'm not sure, waiting on George to chime in (either here, or via email) and let me know whether or not this issue has been resolved.
Make sure to let me know as well Mike. ;)

MikeDVB
05-21-2011, 11:18 AM
Make sure to let me know as well Mike. ;)

I haven't heard anything either way.

mistwang
05-22-2011, 06:45 PM
We fixed a few bugs that caused 4.1 to crash; however, for this particular one, we still need a more detailed report from running LSWS under valgrind.

JLHC
06-03-2011, 04:59 AM
We fixed a few bugs that cause 4.1 crash, however, for this particular one, we still need a more detail report running LSWS under valgrind.
Any update to this yet? Planning to upgrade to version 4.1.x but this issue is still holding us back for now. :(

JLHC
06-14-2011, 09:03 PM
I can see that version 4.1.1 has been released. Is this solved in the new version?

JLHC
07-16-2011, 01:32 AM
I can see that version 4.1.2 has been released. Is this solved in the new version?

MikeDVB
07-16-2011, 01:34 AM
I haven't run into the issue in a while, but the issue only occurred when the maximum connections was set to something around 1k~9k. We have it set to something like 50k on each server which has totally prevented the issue from causing issues, not necessarily from occurring in the first place.

The only way for me to know for sure is to drop the servers back to a limit of 5k simultaneous connections and then wait a few days.

JLHC
07-16-2011, 01:39 AM
I haven't run into the issue in a while, but the issue only occurred when the maximum connections was set to something around 1k~9k. We have it set to something like 50k on each server which has totally prevented the issue from causing issues, not necessarily from occurring in the first place.

The only way for me to know for sure is to drop the servers back to a limit of 5k simultaneous connections and then wait a few days.
Well that will be troublesome for you. :(

I hope LiteSpeed Tech's staff can chime in to clarify this. They didn't say anything about this in the Version History / Release Log (http://www.litespeedtech.com/litespeed-web-server-release-log.html).

JLHC
08-02-2011, 11:25 PM
I can see that version 4.1.3 has been released. Is this solved in the new version?

bbb177
09-29-2011, 02:06 PM
The issue is still present. I'm using the newest version and it "hangs" at least once or twice a day. Are you at least working on a solution? I'm paying for a server that doesn't work like it should. And going back to 4.0.20 is not an option because I need the mp4 streaming. That's the main reason that led me to buy a license. I really need to know when the issue will be resolved.

NiteWave
09-29-2011, 07:28 PM
We'd like to solve such issues as soon as possible; the problem is that it's hard to reproduce.

have you tried MikeDVB's suggestion:
I haven't run into the issue in a while, but the issue only occurred when the maximum connections was set to something around 1k~9k. We have it set to something like 50k on each server which has totally prevented the issue from causing issues, not necessarily from occurring in the first place.

bbb177
09-30-2011, 01:16 AM
we'd like to solve such issues as soon as possible, the problem is it's hard to reproduce.

have you tried MikeDVB's suggestion:

Yes, and it doesn't work, the server still "hangs". As far as I know my hosting sent bug reports to you, so you should have detailed knowledge how to reproduce it.

jackc
01-17-2012, 04:32 PM
I'm seeing the same problem too, with the latest 4.1.10. I didn't have this problem before with a heavier server load and an older version of LSWS, which is really weird.
When the problem occurred, LiteSpeed was using 100 percent CPU. I'm also seeing a possible memory leak: LiteSpeed is using a lot more memory than usual over time.

some strace below

epoll_ctl(7, EPOLL_CTL_DEL, 59, {0, {u32=59, u64=59}}) = 0
close(59) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 78, {0, {u32=78, u64=78}}) = 0
close(78) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 115, {0, {u32=115, u64=115}}) = 0
close(115) = 0
recvfrom(136, 0x7fff2ba1daef, 1, 2, 0, 0) = -1 EAGAIN (Resource temporarily unavailable)
epoll_ctl(7, EPOLL_CTL_DEL, 174, {0, {u32=174, u64=174}}) = 0
close(174) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 175, {0, {u32=175, u64=175}}) = 0
close(175) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 256, {0, {u32=256, u64=256}}) = 0
close(256) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 265, {0, {u32=265, u64=265}}) = 0
close(265) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 266, {0, {u32=266, u64=266}}) = 0
close(266) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 268, {0, {u32=268, u64=268}}) = 0
close(268) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 272, {0, {u32=272, u64=272}}) = 0
close(272) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 278, {0, {u32=278, u64=278}}) = 0
close(278) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 279, {0, {u32=279, u64=279}}) = 0
close(279) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 288, {0, {u32=288, u64=288}}) = 0
close(288) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 393, {0, {u32=393, u64=393}}) = 0
close(393) = 0
epoll_ctl(7, EPOLL_CTL_DEL, 425, {0, {u32=425, u64=425}}) = 0
close(425) = 0

mistwang
01-18-2012, 02:33 PM
Which version of LSWS were you using? If you switch back to the older version, will the problem disappear?