TensorFlow: InternalError: Blas SGEMM launch failed

tensorflow blas

60,541

Solution 1

Old question, but may help others.
Try to close interactive sessions active in other processes (if IPython Notebook - just restart kernels). This helped me!

Additionally, I use this code to close local sessions in this kernel during experiments:

if 'session' in locals() and session is not None:
    print('Close interactive session')
    session.close()

Solution 2

I encountered this problem and solved it by setting allow_soft_placement=True and gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3), which specifically define the fraction of memory of GPU been used. I guess this has helped to avoid two tensorflow processes competing for the GPU memory.

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
sess = tf.Session(config=tf.ConfigProto(
  allow_soft_placement=True, log_device_placement=True))

Solution 3

I got this error when running Tensorflow Distributed. Did you check if any of the workers were reporting CUDA_OUT_OF_MEMORY errors? If this is the case it may have to do with where you place your weight and bias variables. E.g.

with tf.device("/job:paramserver/task:0/cpu:0"):
   W = weight_variable([input_units, num_hidden_units])       
   b = bias_variable([num_hidden_units])

Solution 4

My environment is Python 3.5, Tensorflow 0.12 and Windows 10 (no Docker). I am training neural networks in both CPU and GPU. I came across the same error InternalError: Blas SGEMM launch failed whenever training in the GPU.

I could not find the reason why this error happens but I managed to run my code in the GPU by avoiding the tensorflow function tensorflow.contrib.slim.one_hot_encoding(). Instead, I do the one-hot-encoding operation in numpy (input and output variables).

The following code reproduces the error and the fix. It is a minimal setup to learn the y = x ** 2 function using gradient descent.

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim

def test_one_hot_encoding_using_tf():

    # This function raises the "InternalError: Blas SGEMM launch failed" when run in the GPU

    # Initialize
    tf.reset_default_graph()
    input_size = 10
    output_size = 100
    input_holder = tf.placeholder(shape=[1], dtype=tf.int32, name='input')
    output_holder = tf.placeholder(shape=[1], dtype=tf.int32, name='output')

    # Define network
    input_oh = slim.one_hot_encoding(input_holder, input_size)
    output_oh = slim.one_hot_encoding(output_holder, output_size)
    W1 = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
    output_v = tf.matmul(input_oh, W1)
    output_v = tf.reshape(output_v, [-1])

    # Define updates
    loss = tf.reduce_sum(tf.square(output_oh - output_v))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    update_model = trainer.minimize(loss)

    # Optimize
    init = tf.initialize_all_variables()
    steps = 1000

    # Force CPU/GPU
    config = tf.ConfigProto(
        # device_count={'GPU': 0}  # uncomment this line to force CPU
    )

    # Launch the tensorflow graph
    with tf.Session(config=config) as sess:
        sess.run(init)

        for step_i in range(steps):

            # Get sample
            x = np.random.randint(0, 10)
            y = np.power(x, 2).astype('int32')

            # Update
            _, l = sess.run([update_model, loss], feed_dict={input_holder: [x], output_holder: [y]})

        # Check model
        print('Final loss: %f' % l)

def test_one_hot_encoding_no_tf():

    # This function does not raise the "InternalError: Blas SGEMM launch failed" when run in the GPU

    def oh_encoding(label, num_classes):
        return np.identity(num_classes)[label:label + 1].astype('int32')

    # Initialize
    tf.reset_default_graph()
    input_size = 10
    output_size = 100
    input_holder = tf.placeholder(shape=[1, input_size], dtype=tf.float32, name='input')
    output_holder = tf.placeholder(shape=[1, output_size], dtype=tf.float32, name='output')

    # Define network
    W1 = tf.Variable(tf.random_uniform([input_size, output_size], 0, 0.01))
    output_v = tf.matmul(input_holder, W1)
    output_v = tf.reshape(output_v, [-1])

    # Define updates
    loss = tf.reduce_sum(tf.square(output_holder - output_v))
    trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    update_model = trainer.minimize(loss)

    # Optimize
    init = tf.initialize_all_variables()
    steps = 1000

    # Force CPU/GPU
    config = tf.ConfigProto(
        # device_count={'GPU': 0}  # uncomment this line to force CPU
    )

    # Launch the tensorflow graph
    with tf.Session(config=config) as sess:
        sess.run(init)

        for step_i in range(steps):

            # Get sample
            x = np.random.randint(0, 10)
            y = np.power(x, 2).astype('int32')

            # One hot encoding
            x = oh_encoding(x, 10)
            y = oh_encoding(y, 100)

            # Update
            _, l = sess.run([update_model, loss], feed_dict={input_holder: x, output_holder: y})

        # Check model
        print('Final loss: %f' % l)

Solution 5

maybe you not free your gpu rigthly , if you are using linux,try "ps -ef | grep python" to see what jobs are using GPU. then kill them

View more solutions

60,541

Author by

rafaelcosman

Stanford CS grad interested in ML.

Updated on July 05, 2022

Comments

rafaelcosman about 2 years

When I run sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys}) I get InternalError: Blas SGEMM launch failed. Here is the full error and stack trace:

InternalErrorTraceback (most recent call last)
<ipython-input-9-a3261a02bdce> in <module>()
      1 batch_xs, batch_ys = mnist.train.next_batch(100)
----> 2 sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    338     try:
    339       result = self._run(None, fetches, feed_dict, options_ptr,
--> 340                          run_metadata_ptr)
    341       if run_metadata:
    342         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    562     try:
    563       results = self._do_run(handle, target_list, unique_fetches,
--> 564                              feed_dict_string, options, run_metadata)
    565     finally:
    566       # The movers are no longer used. Delete them.

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
    635     if handle is None:
    636       return self._do_call(_run_fn, self._session, feed_dict, fetch_list,
--> 637                            target_list, options, run_metadata)
    638     else:
    639       return self._do_call(_prun_fn, self._session, handle, feed_dict,

/usr/local/lib/python2.7/dist-packages/tensorflow/python/client/session.pyc in _do_call(self, fn, *args)
    657       # pylint: disable=protected-access
    658       raise errors._make_specific_exception(node_def, op, error_message,
--> 659                                             e.code)
    660       # pylint: enable=protected-access
    661 

InternalError: Blas SGEMM launch failed : a.shape=(100, 784), b.shape=(784, 10), m=100, n=10, k=784
     [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](_recv_Placeholder_0/_4, Variable/read)]]
Caused by op u'MatMul', defined at:
  File "/usr/lib/python2.7/runpy.py", line 162, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py", line 3, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py", line 596, in launch_instance
    app.start()
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py", line 442, in start
    ioloop.IOLoop.instance().start()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py", line 162, in start
    super(ZMQIOLoop, self).start()
  File "/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py", line 883, in start
    handler_func(fd_obj, events)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py", line 391, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py", line 199, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2723, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2825, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py", line 2885, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-4-d7414c4b6213>", line 4, in <module>
    y = tf.nn.softmax(tf.matmul(x, W) + b)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/math_ops.py", line 1036, in matmul
    name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/gen_math_ops.py", line 911, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/ops/op_def_library.py", line 655, in apply_op
    op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 2154, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/framework/ops.py", line 1154, in __init__
    self._traceback = _extract_stack()

Stack: EC2 g2.8xlarge machine, Ubuntu 14.04