If application fxs files are missing, crash observed in retries

Hi,

Please help by checking the scenario and code snippet provided along with gdb logs.

Let us know if anything missing in cleanup of memory and reason for failures.

I am verifying below scenario,

  • application fxs files are loaded into confd after application connects to confd daemon

  • As fxs files are not present in confd when application tries to connect to confd, cdb_subscribe2() API is failing.

  • Application should retry by being in a while loop until respective fxs are loaded and confd restarts.

Observed errors:
Application crashes 2 times and 3rd time it is struct in confd_load_schemas() API call

1st crash : *** Error in ./eb': free(): invalid pointer: 0x00002b46ce934000 *** 2nd crash : *** Error in./eb’: malloc(): memory corruption: 0x00002b4fb59f9010 ***

Below is the trace of application when it is struck in confd_load_schemas() API call

		(gdb) bt
		#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
		#1  0x000000333ac7e14a in _L_lock_10423 () at malloc.c:5147
		#2  0x000000333ac7b985 in __GI___libc_malloc (bytes=56) at malloc.c:2852
		#3  0x000000333a40c74e in _dl_map_object_deps (map=map@entry=0x2b4fc0fb9ab8, preloads=preloads@entry=0x0, npreloads=npreloads@entry=0,
			trace_mode=trace_mode@entry=0, open_mode=open_mode@entry=-2147483648) at dl-deps.c:511
		#4  0x000000333a41292e in dl_open_worker (a=a@entry=0x2b4fc577bd08) at dl-open.c:261
		#5  0x000000333a40e6d4 in _dl_catch_error (objname=objname@entry=0x2b4fc577bcf8, errstring=errstring@entry=0x2b4fc577bd00,
			mallocedp=mallocedp@entry=0x2b4fc577bcf7, operate=operate@entry=0x333a412820 <dl_open_worker>, args=args@entry=0x2b4fc577bd08)
			at dl-error.c:177
		#6  0x000000333a4122eb in _dl_open (file=0x333ad61a46 "libgcc_s.so.1", mode=-2147483647, caller_dlopen=<optimized out>, nsid=-2, argc=2,
			argv=0x7fffd1098ff8, env=0x284a4c0) at dl-open.c:650
		#7  0x000000333ad1be72 in do_dlopen (ptr=<optimized out>) at dl-libc.c:87
		#8  0x000000333a40e6d4 in _dl_catch_error (objname=0x2b4fc577bf00, errstring=0x2b4fc577bf08, mallocedp=0x2b4fc577beff,
			operate=0x333ad1be30 <do_dlopen>, args=0x2b4fc577bf20) at dl-error.c:177
		#9  0x000000333ad1bf0f in dlerror_run (operate=<optimized out>, args=<optimized out>) at dl-libc.c:46
		#10 0x000000333ad1bf81 in __GI___libc_dlopen_mode (name=<optimized out>, mode=<optimized out>) at dl-libc.c:163
		#11 0x000000333acf5a75 in init () at ../sysdeps/x86_64/backtrace.c:52
		#12 0x00002b4fbcfc19a3 in pthread_once () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S:103
		#13 0x000000333acf5b8c in __GI___backtrace (array=<optimized out>, size=<optimized out>) at ../sysdeps/x86_64/backtrace.c:103
		#14 0x000000333ac1fe0b in backtrace_and_maps (do_abort=989484576, do_abort@entry=2, written=128, fd=2)
			at ../sysdeps/unix/sysv/linux/libc_fatal.c:49
		#15 0x000000333ac6e53f in __libc_message (do_abort=<optimized out>, fmt=<optimized out>) at ../sysdeps/posix/libc_fatal.c:172
		#16 0x000000333ac7830e in malloc_printerr (action=<optimized out>, str=<optimized out>, ptr=<optimized out>) at malloc.c:4920
		#17 0x000000333ac79e19 in _int_malloc (av=<optimized out>, bytes=<optimized out>) at malloc.c:3395
		#18 0x000000333ac7b990 in __GI___libc_malloc (bytes=13845) at malloc.c:2855
		#19 0x0000000000479665 in confd_malloc ()
		#20 0x000000000047bb04 in term_read ()
		#21 0x00000000004a7c49 in term_read_list ()
		#22 0x00000000004ab2f7 in load_many_ns ()
		#23 0x00000000004abe97 in maapi_load_schemas ()
		#24 0x000000000047e9fe in confd_load_schemas ()
		#25 0x000000000046dd64 in eb_confd_init () at confd_thread.c:1848

Below is the code snippet:

int eb_confd_init(void)
{
    enum confd_debug_level dbg_lvl = CONFD_SILENT; // CONFD_TRACE, CONFD_DEBUG
    struct confd_action_cbs eb_acb;
    struct confd_trans_cbs eb_trans;

    confd_init(EB_CONFD_APP, log_f_ptr, dbg_lvl);

    if( CONFD_OK != (confd_load_schemas((struct sockaddr *)&eb_addr, sizeof(eb_addr))) ) {
        return FAILURE_VAL;
    }

	if ((eb_dctx = confd_init_daemon(EB_CONFD_APP)) == NULL) {
        return FAILURE_VAL;
    }

    if ((eb_ctlsock = socket(AF_INET, SOCK_STREAM, 0)) < 0 ){
        goto free_ctx;
    }

    if ( CONFD_OK != confd_connect(eb_dctx, eb_ctlsock, CONTROL_SOCKET,
                     (struct sockaddr*)&eb_addr,sizeof (struct sockaddr_in))){
        goto free_ctlsock;
    }

	if ( (eb_workersock = socket(AF_INET, SOCK_STREAM, 0)) < 0 ){
        goto free_ctlsock;
    }

    if ( CONFD_OK != confd_connect(eb_dctx, eb_workersock, WORKER_SOCKET,
                     (struct sockaddr*)&eb_addr, sizeof (struct sockaddr_in)) ){
            goto free_workersock;
    }

    memset(&eb_acb, 0, sizeof(eb_acb));

    strcpy(eb_acb.actionpoint, EB_CONFD_AP);

    eb_acb.init   = init_action;
    eb_acb.action = do_action;
    eb_acb.abort  = abort_action;

    if ( CONFD_OK != confd_register_action_cbs(eb_dctx, &eb_acb) ){
        goto free_workersock;
    }

    memset(&eb_trans, 0, sizeof (eb_trans));
	eb_trans.init  = init_trans;

    if ( CONFD_OK != confd_register_trans_cb(eb_dctx, &eb_trans) ){
        goto free_workersock;
    }

    if (confd_register_done(eb_dctx) != CONFD_OK){
        goto free_workersock;
    }
    if ((eb_ss = socket(AF_INET, SOCK_STREAM, 0)) < 0 )
    {
        goto free_workersock;
    }
    if( CONFD_OK != cdb_connect(eb_ss, CDB_SUBSCRIPTION_SOCKET,
                    (struct sockaddr *)&eb_addr, sizeof(eb_addr))){
        goto free_sssock;
    }
	if( CONFD_OK != cdb_mandatory_subscriber(eb_ss, EB_CONFD_MS)){
        goto free_sssock;
    }
    if( CONFD_OK != cdb_subscribe2(eb_ss, CDB_SUB_RUNNING_TWOPHASE, 0,
                     CONFD_PRIORITY_LEVEL2, &spoint1, 0, EB_CONFD_ROOT)){
        goto free_sssock;
    }
    if( CONFD_OK != cdb_subscribe2(eb_ss, CDB_SUB_RUNNING_TWOPHASE, 0,
                     CONFD_PRIORITY_LEVEL1, &spoint2, 0, EB_CTX_ROOT)){
        goto free_sssock;
    }
    if( CONFD_OK != cdb_subscribe_done(eb_ss)){
        goto free_sssock;
    }
    if ( (eb_opersock = socket(AF_INET, SOCK_STREAM, 0)) < 0 ){
        goto free_sssock;
    }
	if( CONFD_OK != cdb_connect(eb_opersock, CDB_DATA_SOCKET,
                                (struct sockaddr *)&eb_addr, sizeof(eb_addr))){
        goto free_opersock;
    }
    if ((eb_maapi_sock = socket(AF_INET, SOCK_STREAM, 0)) < 0 ){
        goto free_opersock;
    }
    if ( CONFD_OK != maapi_connect( eb_maapi_sock, (struct sockaddr*)&eb_addr,
                                    sizeof (struct sockaddr_in))){
        goto free_maapi_sock;
    }
    return 0;

	free_maapi_sock:
		close(eb_maapi_sock);
	free_opersock:
		close(eb_opersock);
	free_sssock:
		close(eb_ss);
	free_workersock:
		close(eb_workersock);
	free_ctlsock:
		close(eb_ctlsock);
	free_ctx:
		confd_release_daemon(eb_dctx);

	return FAILURE_VAL ;
}

    int main(){
        	while ( true ){
                if ( eb_confd_init() < 0 ){
                   printf("Failed to initialize confd\n");
                   continue;
                }
        	}
        }

Code seems to be incomplete, several functions, #deifnes are probably missing.

I have tried to adapt your code to compile and test. I get CONFD_ERR during, no crash

 if ( CONFD_OK != cdb_subscribe2(eb_ss, CDB_SUB_RUNNING_TWOPHASE, 0,
    CONFD_PRIORITY_LEVEL2, &spoint1, 0, EB_CONFD_ROOT)) {
        printf("%s:%i", __FILE__, __LINE__);
        goto free_sssock;
    }

Please can you download example, review if it is similar to yours code and test?

https://cisco.box.com/s/j0duglmepqx0vayx2tao8ayhfhpmtt5p

make clean all start

It should display:

$ example.c:108Failed to initialize confd

Hi mnovak,

I will try to execute this thread detaching from the application like a sample program and update you with result.

Thanks
Durga Prakash

The reason you’re stuck in confd_load_schemas() is that the code is trying to grab a lock that is already taken.

I believe the reason this happens is that your thread is making nested calls to malloc() - the first time from confd_malloc() and second time from the pthread_once()/init() combination.

pthread_once() is used by glibc for dynamic package initialization (glibc discovers that it need to map in a new shared library).

Hi ,

I tried with a sample program and the loop iterates without any issues.

I have verified the stack trace of my application and observed that 3 threads are waiting at the same lock. This time process struck at the same point.

Please check the below trace of the threads and suggest any alternative to avoid this lock.

-------------------------------------------------------------------
(gdb) info threads
  Id   Target Id         Frame
  15   Thread 0x2b2940aef700 (LWP 5416) "app" 0x000000333acdf443 in select () at ../sysdeps/unix/syscall-template.S:81
  14   Thread 0x2b2940d0f700 (LWP 5418) "app" pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  13   Thread 0x2b2940f10700 (LWP 5419) "app" do_sigwait (set=<optimized out>, sig=0x2b2940f0ebec)
    at ../nptl/sysdeps/unix/sysv/linux/../../../../../sysdeps/unix/sysv/linux/sigwait.c:63
  12   Thread 0x2b2941111700 (LWP 5420) "app" pthread_cond_timedwait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:238
  11   Thread 0x2b2941312700 (LWP 5421) "app" __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
  10   Thread 0x2b2941513700 (LWP 5422) "app" pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  9    Thread 0x2b2941714700 (LWP 5423) "app" pthread_cond_timedwait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S:238
  8    Thread 0x2b2941915700 (LWP 5424) "app" pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  7    Thread 0x2b2941b16700 (LWP 5478) "app" __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
  6    Thread 0x2b2941d17700 (LWP 5479) "app" __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
  5    Thread 0x2b2941f18700 (LWP 5480) "app" 0x00002b2939f80a9d in open64 () at ../sysdeps/unix/syscall-template.S:81
  4    Thread 0x2b2942119700 (LWP 5482) "app" __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
  3    Thread 0x2b294231a700 (LWP 5483) "app" pthread_cond_wait@@GLIBC_2.3.2 ()
    at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
  2    Thread 0x2b294271c700 (LWP 5485) "app" __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
* 1    Thread 0x2b2940637b80 (LWP 5412) "app" 0x00002b2939f7a6a2 in pthread_join (threadid=47456200001280, thread_return=0x0)
    at pthread_join.c:92
------------------------------
(gdb) thread 2
[Switching to thread 2 (Thread 0x2b294271c700 (LWP 5485))]
#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
95      ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
#1  0x000000333ac7e14a in _L_lock_10423 () at malloc.c:5147
#2  0x000000333ac7b985 in __GI___libc_malloc (bytes=13845) at malloc.c:2852
#3  0x0000000000479665 in confd_malloc ()
#4  0x000000000047bb04 in term_read ()
#5  0x00000000004a7c49 in term_read_list ()
#6  0x00000000004ab2f7 in load_many_ns ()
#7  0x00000000004abe97 in maapi_load_schemas ()
#8  0x000000000047e9fe in confd_load_schemas ()
#9  0x000000000046dd64 in eb_confd_init () at confd_thread.c:1848
#10 0x000000000046f441 in eB_confd_thread (val=0x0) at confd_thread.c:2333
#11 0x00002b2939f79201 in start_thread (arg=0x2b294271c700) at pthread_create.c:312
#12 0x000000333ace660d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-------------------------
(gdb) thread 6
[Switching to thread 6 (Thread 0x2b2941d17700 (LWP 5479))]
#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
95      in ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
(gdb) bt
#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
#1  0x000000333ac7e14a in _L_lock_10423 () at malloc.c:5147
#2  0x000000333ac7b985 in __GI___libc_malloc (bytes=56) at malloc.c:2852
#3  0x000000333a40c74e in _dl_map_object_deps (map=map@entry=0x2b293df76ab8, preloads=preloads@entry=0x0, npreloads=npreloads@entry=0,
    trace_mode=trace_mode@entry=0, open_mode=open_mode@entry=-2147483648) at dl-deps.c:511
#4  0x000000333a41292e in dl_open_worker (a=a@entry=0x2b2941d0f0f8) at dl-open.c:261
#5  0x000000333a40e6d4 in _dl_catch_error (objname=objname@entry=0x2b2941d0f0e8, errstring=errstring@entry=0x2b2941d0f0f0,
    mallocedp=mallocedp@entry=0x2b2941d0f0e7, operate=operate@entry=0x333a412820 <dl_open_worker>, args=args@entry=0x2b2941d0f0f8)
    at dl-error.c:177
#6  0x000000333a4122eb in _dl_open (file=0x333ad61a46 "libgcc_s.so.1", mode=-2147483647, caller_dlopen=<optimized out>, nsid=-2, argc=2,
    argv=0x7fff1a328048, env=0x284a4c0) at dl-open.c:650
#7  0x000000333ad1be72 in do_dlopen (ptr=<optimized out>) at dl-libc.c:87
#8  0x000000333a40e6d4 in _dl_catch_error (objname=0x2b2941d0f2f0, errstring=0x2b2941d0f2f8, mallocedp=0x2b2941d0f2ef,
    operate=0x333ad1be30 <do_dlopen>, args=0x2b2941d0f310) at dl-error.c:177
#9  0x000000333ad1bf0f in dlerror_run (operate=<optimized out>, args=<optimized out>) at dl-libc.c:46
#10 0x000000333ad1bf81 in __GI___libc_dlopen_mode (name=<optimized out>, mode=<optimized out>) at dl-libc.c:163
#11 0x000000333acf5a75 in init () at ../sysdeps/x86_64/backtrace.c:52
#12 0x00002b2939f7e9a3 in pthread_once () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S:103
#13 0x000000333acf5b8c in __GI___backtrace (array=<optimized out>, size=<optimized out>) at ../sysdeps/x86_64/backtrace.c:103
#14 0x000000333ac1fe0b in backtrace_and_maps (do_abort=989484576, do_abort@entry=2, written=128, fd=2)
    at ../sysdeps/unix/sysv/linux/libc_fatal.c:49
#15 0x000000333ac6e53f in __libc_message (do_abort=<optimized out>, fmt=<optimized out>) at ../sysdeps/posix/libc_fatal.c:172
#16 0x000000333ac7830e in malloc_printerr (action=<optimized out>, str=<optimized out>, ptr=<optimized out>) at malloc.c:4920
#17 0x000000333ac79e19 in _int_malloc (av=<optimized out>, bytes=<optimized out>) at malloc.c:3395
#18 0x000000333ac7b990 in __GI___libc_malloc (bytes=320) at malloc.c:2855
#19 0x00000000004519df in add_connection (connection=0x2b704a0) at server_thread.c:222
#20 0x0000000000454c0b in add_card_info (sd=19, resp=0x2b2941d142f0) at server_thread.c:492
#21 0x00000000004594b5 in handle_aswd_responses (pMsg=0x2b2941d142f0, k=19) at server_thread.c:951
#22 0x00000000004586eb in handle_client_connections (p=0x0) at server_thread.c:795
#23 0x00002b2939f79201 in start_thread (arg=0x2b2941d17700) at pthread_create.c:312
#24 0x000000333ace660d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
---------------------------
(gdb) thread 7
[Switching to thread 7 (Thread 0x2b2941b16700 (LWP 5478))]
#0  __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
135     ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
#0  __lll_lock_wait () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
#1  0x00002b2939f7b6f7 in _L_lock_942 () from /lib64/libpthread.so.0
#2  0x00002b2939f7b520 in __GI___pthread_mutex_lock (mutex=0x6fcea0 <conn_lock>) at ../nptl/pthread_mutex_lock.c:79
#3  0x000000000045f1b3 in sendto_standby_rp_connection_list () at redundancy_thread.c:517
#4  0x000000000045c261 in manager_redundancy_server (p=0x0) at redundancy_thread.c:232
#5  0x00002b2939f79201 in start_thread (arg=0x2b2941b16700) at pthread_create.c:312
#6  0x000000333ace660d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
-----------------------
(gdb) thread 11
[Switching to thread 11 (Thread 0x2b2941312700 (LWP 5421))]
#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
95      ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: No such file or directory.
(gdb) bt
#0  __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95
#1  0x000000333ac7e29e in _L_lock_11681 () at malloc.c:5147
#2  0x000000333ac7c29f in __libc_calloc (n=<optimized out>, elem_size=<optimized out>) at malloc.c:3161
#3  0x00002b2936de9465 in rbnl_evtmd_thread_rx () from /usr/lib/siara/lib64/librbn_events.so.0.0
#4  0x00002b2939f79201 in start_thread (arg=0x2b2941312700) at pthread_create.c:312
#5  0x000000333ace660d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111
(gdb)

Thanks
Durga Prakash

Since all threads in a linux process share the same heap it’s no mystery that several threads block once thread 6 in this case deadlocks trying to grab the same heap lock it already holds.

You have to figure out why the first malloc() error happens. In this example something goes wrong and malloc() calls malloc_printerr(), frame 16 in thread 6. In case you can’t figure out what’s wrong by looking at the code you run your app in gdb and set a breakpoint on malloc_printerr().

To be able to set the breakpoint on malloc_printerr() you probably have to install the debug package for glibc first.