Kevin Wen's Blog



check the execution of tensorflow with gdb

Posted on 2018-04-29

Here I list some TensorFlow call stacks collected through gdb.
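For reference, traces like these can be captured by attaching gdb to the running Python process and breaking on the TensorFlow symbol of interest; a typical session (the pid is a placeholder) looks like the following.

$ gdb -p <pid>
(gdb) break tensorflow::DirectSession::Run
(gdb) continue
(gdb) bt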

0x1 Session::Run()

(gdb) bt
#0 tensorflow::DirectSession::Run (this=0x55a688ed3660, run_options=..., inputs=std::vector of length 3, capacity 3 = {...}, output_names=std::vector of length 0, capacity 0, target_nodes=std::vector of length 1, capacity 1 = {...}, outputs=0x7f375da478e0, run_metadata=0x7f375da47930) at tensorflow/core/common_runtime/direct_session.cc:439
#1 0x00007f3784cea307 in TF_Run_Helper (session=0x55a688ed3660, handle=0x0, run_options=0x0, input_pairs=std::vector of length 3, capacity 3 = {...}, output_tensor_names=std::vector of length 0, capacity 0, c_outputs=0x7f375da47d00, target_oper_names=std::vector of length 1, capacity 1 = {...}, run_metadata=0x0, status=0x7f37200d5e80) at tensorflow/c/c_api.cc:680
#2 0x00007f3784cea874 in TF_Run (s=0x55a688ed2af0, run_options=0x0, c_input_names=0x7f375da47c60, c_inputs=0x7f375da47cb0, ninputs=3, c_output_names=0x7f375da48060, c_outputs=0x7f375da47d00, noutputs=0, c_target_oper_names=0x7f375da480b0, ntargets=1, run_metadata=0x0, status=0x7f37200d5e80) at tensorflow/c/c_api.cc:735
#3 0x00007f37847382dd in tensorflow::TF_Run_wrapper_helper (session=0x55a688ed2af0, handle=0x0, run_options=0x0, feed_dict=0x7f3762b029d8, output_names=..., target_nodes=..., out_status=0x7f37200d5e80, out_values=0x7f375da48100, run_outputs=0x0) at tensorflow/python/client/tf_session_helper.cc:96
#4 0x00007f3784738936 in tensorflow::TF_Run_wrapper (session=0x55a688ed2af0, run_options=0x0, feed_dict=0x7f3762b029d8, output_names=..., target_nodes=..., out_status=0x7f37200d5e80, out_values=0x7f375da48100, run_outputs=0x0) at tensorflow/python/client/tf_session_helper.cc:149
#5 0x00007f37846bf7e4 in _wrap_TF_Run (args=0x7f376350cdb0) at bazel-out/k8-py3-dbg/bin/tensorflow/python/pywrap_tensorflow_internal.cc:15067
#6 0x000055a68528b2a1 in _PyCFunction_FastCallDict ()
#7 0x000055a68531e0a0 in call_function ()
#8 0x000055a68533f62a in _PyEval_EvalFrameDefault ()
#9 0x000055a685318c78 in PyEval_EvalCodeEx ()

0x2 OpKernel::Compute()

(gdb) bt
#0 0x00007f377fe7ee70 in tensorflow::OpKernelContext::input(int)@plt ()from /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so
#1 0x00007f378001d8dd in tensorflow::HandleFromInput (ctx=0x7f3724169150, input=0) at tensorflow/core/framework/resource_mgr.cc:279
#2 0x00007f37859298b8 in tensorflow::QueueOpKernel::ComputeAsync(tensorflow::OpKernelContext*, std::function<void ()>) (this=0x7f37200cbfb0, ctx=0x7f3724169150, callback=...) at tensorflow/core/kernels/queue_ops.cc:37
#3 0x00007f378482cab8 in tensorflow::Device::ComputeAsync(tensorflow::AsyncOpKernel*, tensorflow::OpKernelContext*, std::function<void ()>)(this=0x55a688ee3260, op_kernel=0x7f37200cbfb0, context=0x7f3724169150, done=...) at ./tensorflow/core/common_runtime/device.h:89
#4 0x00007f37806b1b6e in tensorflow::(anonymous namespace)::ExecutorState::Process (this=0x7f37200d3440, tagged_node=..., scheduled_usec=0)at tensorflow/core/common_runtime/executor.cc:1647
#5 0x00007f37806c01a7 in std::_Mem_fn_base<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long), true>::operator()<tensorflow::(anonymous namespace)::ExecutorState::TaggedNode&, long long&, void> (this=0x7f37240337a0, __object=0x7f37200d3440) at /usr/include/c++/5/functional:600
#6 0x00007f37806bfc62 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::__call<void, 0ul, 1ul, 2ul>(<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>, std::_Index_tuple<0ul, 1ul, 2ul>) (this=0x7f37240337a0, __args=<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>) at /usr/include/c++/5/functional:1074
#7 0x00007f37806bda06 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::operator()<, void>(void) (this=0x7f37240337a0) at /usr/include/c++/5/functional:1133
#8 0x00007f37806bb2ac in std::_Function_handler<void(), std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)> >::_M_invoke(const std::_Any_data &) (__functor=...)at /usr/include/c++/5/functional:1871
#9 0x00007f377ff7f984 in std::function<void ()>::operator()() const (this=0x7f372407f320) at /usr/include/c++/5/functional:2267
#10 0x00007f3780197b9e in tensorflow::thread::EigenEnvironment::ExecuteTask (this=0x55a688ed3c58, t=...)at tensorflow/core/lib/core/threadpool.cc:81
#11 0x00007f378019a65c in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop (this=0x55a688ed3c50, thread_id=1) at external/eigen_archive/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h:232
#12 0x00007f3780198aae in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::NonBlockingThreadPoolTempl(int, bool, tensorflow::thread::EigenEnvironment)::{lambda()#1}::operator()() const ()at external/eigen_archive/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h:65
#13 0x00007f378019bc7c in std::_Function_handler<void (), Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::NonBlockingThreadPoolTempl(int, bool, tensorflow::thread::EigenEnvironment)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (__functor=...)at /usr/include/c++/5/functional:1871
#14 0x00007f377ff7f984 in std::function<void ()>::operator()() const (this=0x55a688ed3050) at /usr/include/c++/5/functional:2267
#15 0x00007f3780197907 in tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}::operator()() const (__closure=0x55a688ed3050) at tensorflow/core/lib/core/threadpool.cc:56
#16 0x00007f37801998d8 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (__functor=...) at /usr/include/c++/5/functional:1871
#17 0x00007f377ff7f984 in std::function<void ()>::operator()() const (this=0x55a688f199b8) at /usr/include/c++/5/functional:2267
#18 0x00007f37801dcf38 in std::_Bind_simple<std::function<void ()> ()>::_M_invoke<>(std::_Index_tuple<>) (this=0x55a688f199b8)at /usr/include/c++/5/functional:1531
#19 0x00007f37801dcea1 in std::_Bind_simple<std::function<void ()> ()>::operator()() (this=0x55a688f199b8)at /usr/include/c++/5/functional:1520
#20 0x00007f37801dce40 in std::thread::_Impl<std::_Bind_simple<std::function<void ()> ()> >::_M_run() (this=0x55a688f199a0)at /usr/include/c++/5/thread:115
#21 0x00007f377ed4fc80 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#22 0x00007f3796f746ba in start_thread (arg=0x7f3760247700) at pthread_create.c:333
#23 0x00007f3796caa3dd in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

0x3 Producer in tensorflow

It is triggered by test.py, which sends commands to the TensorFlow native code.
producer

0x4 Consumer in tensorflow

TensorFlow's worker threads pull tasks from the queue and then execute them.
consumer
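The two diagrams condense to a classic pattern. As an illustration only (plain Java, not TensorFlow code), here is a minimal, runnable sketch in which a producer enqueues tasks into a blocking queue while worker threads take and execute them.

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class ProducerConsumerDemo {
    public static void main(String[] args) throws InterruptedException {
        // The queue plays the role of the executor's ready-task queue.
        BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>();

        // Consumers: worker threads pull tasks from the queue and execute them.
        for (int i = 0; i < 2; i++) {
            Thread worker = new Thread(() -> {
                try {
                    while (true) {
                        queue.take().run();   // blocks until a task is available
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            worker.setDaemon(true);
            worker.start();
        }

        // Producer: the main thread enqueues tasks, like Session.run scheduling ops.
        for (int i = 0; i < 5; i++) {
            final int id = i;
            queue.put(() -> System.out.println("executed task " + id));
        }
        Thread.sleep(100);  // give the workers time to drain the queue
    }
}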

0x5 gdb debug of convolutional_network.py

(gdb) bt
#0 0x00007f90a7e2d8f0 in tensorflow::ConvBackpropComputeDimensions(tensorflow::StringPiece, int, tensorflow::TensorShape const&, tensorflow::TensorShape const&, tensorflow::TensorShape const&, std::vector<int, std::allocator<int> > const&, tensorflow::Padding, tensorflow::TensorFormat, tensorflow::ConvBackpropDimensions*)@plt () from /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
#1 0x00007f90ab73bb73 in tensorflow::Conv2DCustomBackpropInputOp<Eigen::ThreadPoolDevice, float>::Compute (this=0x556909e28fa0, context=0x7f9083774840) at tensorflow/core/kernels/conv_grad_input_ops.cc:311
#2 0x00007f90a3ee091b in tensorflow::ThreadPoolDevice::Compute ( this=0x5569092d1f30, op_kernel=0x556909e28fa0, context=0x7f9083774840) at tensorflow/core/common_runtime/threadpool_device.cc:59
#3 0x00007f90a3e7bc0a in tensorflow::(anonymous namespace)::ExecutorState::Process (this=0x556909f04770, tagged_node=..., scheduled_usec=0) at tensorflow/core/common_runtime/executor.cc:1652
#4 0x00007f90a3e8a1a7 in std::_Mem_fn_base<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long), true>::operator()<tensorflow::(anonymous namespace)::ExecutorState::TaggedNode&, long long&, void> (this=0x7f904c695d50, __object=0x556909f04770) at /usr/include/c++/5/functional:600
#5 0x00007f90a3e89c62 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::__call<void, 0ul, 1ul, 2ul>(<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>, std::_Index_tuple<0ul, 1ul, 2ul>) (this=0x7f904c695d50, __args=<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>) at /usr/include/c++/5/functional:1074
#6 0x00007f90a3e87a06 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::operator()<, void>(void) (this=0x7f904c695d50) at /usr/include/c++/5/functional:1133
#7 0x00007f90a3e852ac in std::_Function_handler<void(), std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)> >::_M_invoke(const std::_Any_data &) (__functor=...) at /usr/include/c++/5/functional:1871
#8 0x00007f90a3749984 in std::function<void ()>::operator()() const (this=0x7f904c6c5d40) at /usr/include/c++/5/functional:2267
#9 0x00007f90a3961b9e in tensorflow::thread::EigenEnvironment::ExecuteTask (this=0x5569091d82c8, t=...) at tensorflow/core/lib/core/threadpool.cc:81
#10 0x00007f90a396465c in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop (this=0x5569091d82c0, thread_id=1) at external/eigen_archive/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h:232
#11 0x00007f90a3962aae in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::NonBlockingThreadPoolTempl(int, bool, tensorflow::thread::EigenEnvironment)::{lambda()#1}::operator()() const () at external/eigen_archive/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h:65
(gdb) c
Continuing.
[Switching to Thread 0x7f9083775700 (LWP 3586)]
Thread 13 "python" hit Breakpoint 5, 0x00007f90a7e2d8f0 in tensorflow::ConvBackpropComputeDimensions(tensorflow::StringPiece, int, tensorflow::TensorShape const&, tensorflow::TensorShape const&, tensorflow::TensorShape const&, std::vector<int, std::allocator<int> > const&, tensorflow::Padding, tensorflow::TensorFormat, tensorflow::ConvBackpropDimensions*)@plt () from /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
(gdb) n
Single stepping until exit from function _ZN10tensorflow29ConvBackpropComputeDimensionsENS_11StringPieceEiRKNS_11TensorShapeES3_S3_RKSt6vectorIiSaIiEENS_7PaddingENS_12TensorFormatEPNS_22ConvBackpropDimensionsE@plt, which has no line number information.
[Switching to Thread 0x7f90819b9700 (LWP 3587)]
Thread 14 "python" hit Breakpoint 5, 0x00007f90a7e2d8f0 in tensorflow::ConvBackpropComputeDimensions(tensorflow::StringPiece, int, tensorflow::TensorShape const&, tensorflow::TensorShape const&, tensorflow::TensorShape const&, std::vector<int, std::allocator<int> > const&, tensorflow::Padding, tensorflow::TensorFormat, tensorflow::ConvBackpropDimensions*)@plt () from /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/_pywrap_tensorflow_internal.so
(gdb)
Single stepping until exit from function _ZN10tensorflow29ConvBackpropComputeDimensionsENS_11StringPieceEiRKNS_11TensorShapeES3_S3_RKSt6vectorIiSaIiEENS_7PaddingENS_12TensorFormatEPNS_22ConvBackpropDimensionsE@plt, which has no line number information.
tensorflow::ConvBackpropComputeDimensions (label=..., num_spatial_dims=32656, input_shape=..., filter_shape=..., out_backprop_shape=..., strides=std::vector of length 4, capacity 4 = {...}, padding=tensorflow::VALID, data_format=tensorflow::FORMAT_NHWC, dims=0x7f90819b8240) at tensorflow/core/kernels/conv_grad_ops.cc:156
156         ConvBackpropDimensions* dims) {
(gdb) n
[Switching to Thread 0x7f9083775700 (LWP 3586)]
Thread 13 "python" hit Breakpoint 5, tensorflow::ConvBackpropComputeDimensions (label=..., num_spatial_dims=2, input_shape=..., filter_shape=..., out_backprop_shape=..., strides=std::vector of length 4, capacity 4 = {...}, padding=tensorflow::VALID, data_format=tensorflow::FORMAT_NHWC, dims=0x7f9083774240) at tensorflow/core/kernels/conv_grad_ops.cc:160
160             one_dilations, strides, padding, data_format, dims);
(gdb)
#0 std::operator==<tensorflow::Status::State, std::default_delete<tensorflow::Status::State> >(std::unique_ptr<tensorflow::Status::State, std::default_delete<tensorflow::Status::State> > const&, decltype(nullptr)) ( __x=std::unique_ptr<tensorflow::Status::State> containing 0x0) at /usr/include/c++/5/bits/unique_ptr.h:631
#1 0x00007f90a7e9e41d in tensorflow::Status::ok (this=0x7f90819b8150) at ./tensorflow/core/lib/core/status.h:53
#2 0x00007f90ab73bb86 in tensorflow::Conv2DCustomBackpropInputOp<Eigen::ThreadPoolDevice, float>::Compute (this=0x556909e5c730, context=0x7f90819b8840) at tensorflow/core/kernels/conv_grad_input_ops.cc:311
#3 0x00007f90a3ee091b in tensorflow::ThreadPoolDevice::Compute ( this=0x5569092d1f30, op_kernel=0x556909e5c730, context=0x7f90819b8840) at tensorflow/core/common_runtime/threadpool_device.cc:59
#4 0x00007f90a3e7bc0a in tensorflow::(anonymous namespace)::ExecutorState::Process (this=0x55690a0f5150, tagged_node=..., scheduled_usec=0) at tensorflow/core/common_runtime/executor.cc:1652
#5 0x00007f90a3e8a1a7 in std::_Mem_fn_base<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long), true>::operator()<tensorflow::(anonymous namespace)::ExecutorState::TaggedNode&, long long&, void> (this=0x7f90508b25b0, __object=0x55690a0f5150) at /usr/include/c++/5/functional:600
#6 0x00007f90a3e89c62 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::__call<void, 0ul, 1ul, 2ul>(<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>, std::_Index_tuple<0ul, 1ul, 2ul>) (this=0x7f90508b25b0, __args=<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>) at /usr/include/c++/5/functional:1074 ---Type <return> to continue, or q <return> to quit---
(gdb) n
tensorflow::Conv2DCustomBackpropInputOp<Eigen::ThreadPoolDevice, float>::Compute (this=0x556909e5c730, context=0x7f90819b8840) at tensorflow/core/kernels/conv_grad_input_ops.cc:317
317         Tensor* in_backprop = nullptr;
(gdb) list
312             ConvBackpropComputeDimensions(
313                 "Conv2DCustomBackpropInput", /*num_spatial_dims=*/2,
314                 input_shape, filter.shape(), out_backprop.shape(),
315                 strides_, padding_, data_format_, &dims));
316
317         Tensor* in_backprop = nullptr;
318         OP_REQUIRES_OK(context,
319                        context->allocate_output(0, input_shape, &in_backprop));
320
321         // TODO(andydavis) Consider moving code shared with
(gdb) n
318 OP_REQUIRES_OK(context,
(gdb)
Thread info
(gdb) info threads
Id Target Id Frame
1 Thread 0x7f90bab09700 (LWP 3530) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
2 Thread 0x7f90b6a6a700 (LWP 3531) "python" 0x00007f90ba6f387f in __libc_recv (fd=4, buf=0x7f90b6a7afc8, n=4, flags=0) at ../sysdeps/unix/sysv/linux/x86_64/recv.c:28
3 Thread 0x7f908fc55700 (LWP 3560) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
4 Thread 0x7f908f454700 (LWP 3561) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
5 Thread 0x7f908ec53700 (LWP 3562) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
6 Thread 0x7f908e452700 (LWP 3563) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
7 Thread 0x7f908693b700 (LWP 3578) "python" 0x00007f90ba6f2827 in futex_abstimed_wait_cancelable (private=0, abstime=0x0, expected=0, futex_word=0x7f9068013280) at ../sysdeps/unix/sysv/linux/futex-internal.h:205
8 Thread 0x7f9085f7a700 (LWP 3581) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
9 Thread 0x7f9085779700 (LWP 3582) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
10 Thread 0x7f9084f78700 (LWP 3583) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
11 Thread 0x7f9084777700 (LWP 3584) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
12 Thread 0x7f9083f76700 (LWP 3585) "python" tensorflow::ConvBackpropComputeDimensions (label=..., num_spatial_dims=2, input_shape=..., filter_shape=..., out_backprop_shape=..., strides=std::vector of length 4, capacity 4 = {...}, padding=tensorflow::VALID, data_format=tensorflow::FORMAT_NHWC, dims=0x7f9083f75240) at tensorflow/core/kernels/conv_grad_ops.cc:160
13 Thread 0x7f9083775700 (LWP 3586) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
* 14 Thread 0x7f90819b9700 (LWP 3587) "python" tensorflow::ConvBackpropComputeDimensions (label=..., num_spatial_dims=2, input_shape=..., filter_shape=..., out_backprop_shape=..., strides=std::vector of length 4, capacity 4 = {...}, padding=tensorflow::VALID, data_format=tensorflow::FORMAT_NHWC, dims=0x7f90819b8240) at tensorflow/core/kernels/conv_grad_ops.cc:160
15 Thread 0x7f90811b8700 (LWP 3588) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
16 Thread 0x7f90809b7700 (LWP 3589) "python" pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185
17 Thread 0x7f907dde7700 (LWP 3590) "python" 0x00007f90ba6f2827 in futex_abstimed_wait_cancelable (private=0, abstime=0x0, expected=0, futex_word=0x7f9038001380) at ../sysdeps/unix/sysv/linux/futex-internal.h:205

Design a p2p streaming capturer based on flv

Posted on 2018-04-22

0x1 Introduction

The streaming capturer receives media data sent from the client. The client is similar to today's live-streaming apps: it pushes data to the streaming capturer over RTMP/RTSP. The streaming capturer depacketizes the received data, transcodes it if necessary, and then hands the data to downstream modules for further processing; those modules can forward the data to other servers that provide streaming services. The p2p streaming capturer described below packages the received and transcoded data into P2P protocol packets and sends them to a p2p seed node, which acts as the seed of the p2p streaming network. A player first connects to this p2p seed node to fetch the media data it needs for playback, and players that join later can obtain part of their data from earlier players instead of fetching everything from the p2p seed node, which significantly reduces the bandwidth load on the streaming server.
Adobe FLV is a streaming media format. After the p2p streaming capturer receives the client's data stream, it packages the data into FLV format and then wraps the FLV data into P2P packets.

The system architecture diagram is shown below.
architecture

0x2 FLV data format

Below is a brief introduction to the FLV data format.
As shown below, an FLV file starts with a 9-byte FLV header.

0x21 The flv header

flv_header
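As a concrete illustration, here is a small Java sketch that parses those 9 bytes ('F' 'L' 'V', a version byte, a flags byte whose bits mark the presence of audio and video tags, and a 4-byte big-endian data offset), following the public FLV specification:

import java.nio.ByteBuffer;

final class FlvHeader {
    final int version;
    final boolean hasAudio;
    final boolean hasVideo;
    final long dataOffset;   // normally 9 for FLV version 1

    FlvHeader(byte[] first9Bytes) {
        ByteBuffer b = ByteBuffer.wrap(first9Bytes);  // big-endian by default
        if (b.get() != 'F' || b.get() != 'L' || b.get() != 'V') {
            throw new IllegalArgumentException("not an FLV stream");
        }
        version = b.get() & 0xFF;
        int flags = b.get() & 0xFF;
        hasAudio = (flags & 0x04) != 0;   // bit 2: audio tags present
        hasVideo = (flags & 0x01) != 0;   // bit 0: video tags present
        dataOffset = b.getInt() & 0xFFFFFFFFL;
    }
}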

After the FLV header comes the file body, which consists of interleaved tags and tag sizes.

0x22 The flv file body

flv_file_body

0x23 flv tag

Below is the data layout of an FLV tag.
flv_tag

0x3 Design of the FLV-based p2p streaming capturer

The sequence diagram is as follows.

sequence

On the left is the p2p streaming capturer, which interacts with the p2p seed node. The p2p streaming capturer receives RTMP/RTSP packets from the client and, after depacketizing/transcoding, produces FLV data. At this point it must save the FLV header and media-type information such as SPS/PPS. The p2p streaming capturer also needs an internal buffer to cache the packaged FLV data, so that it can later be wrapped into P2P packets and sent to the p2p seed node.
After connecting to the p2p seed node, the p2p streaming capturer first sends a registration (Register) message, then a media-type (MediaType) message that includes SPS/PPS and similar information, and then begins wrapping the previously cached data into P2P packets of a fixed size (16384 bytes) and sending them to the p2p seed node.
Once the p2p streaming capturer has connected, the p2p seed node must report the relevant information to the tracker server, which effectively registers a new program source with the tracker.
A player can connect to the p2p seed node to obtain playback data. For live streaming, the data a player receives right after connecting to the p2p seed node may not start at a keyframe, in which case playback cannot start immediately and the player must wait for the next keyframe. To reduce this startup delay, the p2p seed node can implement a GOP cache: it keeps all data starting from the most recent keyframe, and when a player connects, it sends the GOP cache contents to the player so that no waiting is needed. When a new keyframe arrives, the GOP cache is cleared and refilled with data starting from that keyframe.
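A minimal sketch of the packetization step just described: the cached FLV byte stream is cut into fixed 16384-byte chunks before being wrapped into P2P packets (the P2P packet header itself is protocol-specific and omitted here):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

final class Packetizer {
    static final int CHUNK_SIZE = 16384;

    // Split the buffered FLV data into fixed-size payloads for P2P packets.
    static List<byte[]> split(byte[] flvData) {
        List<byte[]> chunks = new ArrayList<>();
        for (int off = 0; off < flvData.length; off += CHUNK_SIZE) {
            int end = Math.min(off + CHUNK_SIZE, flvData.length);
            chunks.add(Arrays.copyOfRange(flvData, off, end));
        }
        return chunks;
    }
}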
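Here is a minimal sketch of the GOP-cache policy described above, assuming each media unit is an FLV tag with a keyframe flag; a real seed node would also deal with timestamps and concurrent readers:

import java.util.ArrayList;
import java.util.List;

final class GopCache {
    private final List<byte[]> tags = new ArrayList<>();

    // On every new tag: restart the cache at each keyframe.
    synchronized void onTag(byte[] tag, boolean isKeyframe) {
        if (isKeyframe) {
            tags.clear();        // drop the old GOP
        }
        tags.add(tag);           // cache from the latest keyframe onward
    }

    // When a player connects, send the cached GOP so playback starts immediately.
    synchronized List<byte[]> snapshotForNewPlayer() {
        return new ArrayList<>(tags);
    }
}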

Implement yuv2rgb gpu filter

Posted on 2018-04-15

0x1 Introduction

In this article we implement yuv420p-to-RGB conversion on the GPU; the conversion code is written in OpenGL ES GLSL. Since the conversion runs on the GPU, we build an Android app to better demonstrate the implementation process and show the conversion result.
For the full source code, see YUVRender.

The YUV2RGB conversion algorithm is as follows, where Y/U/V are the three components read from the yuv420p data.
B=1.164(Y−16)+2.018(U−128)
G=1.164(Y−16)−0.813(V−128)−0.391(U−128)
R=1.164(Y−16)+1.596(V−128)

0x2 Shader code

0x21 Vertex shader code

The vertex shader takes the vertex coordinates and texture coordinates as input.
The output variable gl_Position receives the computed vertex position in clip coordinates.
The output variable vtexcoord is passed to the fragment shader as the texture coordinate.

precision mediump float;
varying mediump vec2 vtexcoord;
attribute mediump vec4 position;
attribute mediump vec2 texcoord;
void main()
{
gl_Position = position;
vtexcoord = texcoord;
}

0x22 Fragment shader code

The fragment shader takes the texture coordinate as input; it has already been processed by the vertex shader
and then clipped and interpolated during the primitive assembly stage.
It also takes three sampler2D uniforms, corresponding to the Y/U/V planes of the yuv420p data; through texture2D, the built-in texture lookup function, we can read the Y/U/V values at the given texture coordinate.
The code that follows applies the YUV-to-RGB conversion matrix; the exact matrix is the one introduced above.

precision mediump float;
varying mediump vec2 vtexcoord;
uniform lowp sampler2D samplerY;
uniform lowp sampler2D samplerU;
uniform lowp sampler2D samplerV;
void main()
{
mediump float y;
mediump float u;
mediump float v;
lowp vec3 rgb;
mat3 convmatrix = mat3(vec3(1.164, 1.164, 1.164),
vec3(0.0, -0.392, 2.017),
vec3(1.596, -0.813, 0.0));
y = (texture2D(samplerY, vtexcoord).r - (16.0 / 255.0));
u = (texture2D(samplerU, vtexcoord).r - (128.0 / 255.0));
v = (texture2D(samplerV, vtexcoord).r - (128.0 / 255.0));
rgb = convmatrix * vec3(y, u, v);
gl_FragColor = vec4(rgb, 1.0);
}

0x3 Implementation

0x31 Creating the textures

To make the YUV data readable by the GPU, we create three textures, one each for the Y/U/V planes.

GLES20.glGenTextures(3, mTexture, 0);
for(int i = 0; i < 3; i++)
{
GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + i);
GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mTexture[i]);
GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR);
GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR);
GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE);
GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE);
}

0x32 Reading the yuv420p data

The yuv420p data we read is foreman.qcif, at QCIF resolution (176x144). In terms of data layout, the file starts with a 176x144 Y plane, followed by an 88x72 U plane and finally an 88x72 V plane. Below is the code that reads the YUV data.

public void getYuvData(String fileName)
{
String res="";
try{
InputStream in = getResources().getAssets().open(fileName);
int length = in.available();
byte [] buffer = new byte[length];
ByteBuffer byteBuffer = ByteBuffer.allocateDirect(length);
mYuvBuffer = byteBuffer;
in.read(buffer);
in.close();
mYuvBuffer.put(buffer);
mYuvBuffer.position(0);
mBufferU = ByteBuffer.allocateDirect(88*72);
mBufferV = ByteBuffer.allocateDirect(88*72);
mBufferU.put(buffer,176*144,88*72);
mBufferU.position(0);
mBufferV.put(buffer,176*144+88*72,88*72);
mBufferV.position(0);
}catch(Exception e){
e.printStackTrace();
}
}

0x33 Updating the textures

The YUV data is kept in the pixels arrays and uploaded to the GPU driver by calling glTexImage2D.
Since each of the three YUV textures must be uploaded separately, the loop runs three times.

int[] planes = { 0, 1, 2 };
int[] widths = { width, width/2, width/2 };
int[] heights = { height, height/2, height/2 };
for (int i = 0; i < 3; ++i)
{
int plane = planes[i];
GLES20.glActiveTexture(GLES20.GL_TEXTURE0 + i);
GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, mTexture[i]);
ShaderManager.checkGlError("glBindTexture");
if(pixels[plane] == null)
Log.e("YUV2RGBFilter", "pixels[plane] == null");
GLES20.glTexImage2D(
GLES20.GL_TEXTURE_2D,
0,
GLES20.GL_LUMINANCE,
widths[plane],
heights[plane],
0,
GLES20.GL_LUMINANCE,
GLES20.GL_UNSIGNED_BYTE,
pixels[plane]);
ShaderManager.checkGlError("glTexImage2D");
GLES20.glUniform1i(mTextureHandle[i], i);
ShaderManager.checkGlError("glUniform1i");
}

0x34 Updating the remaining parameters

The following code passes the vertex coordinates and texture coordinates to the GPU driver.

GLES20.glVertexAttribPointer(mPositionHandle, 3, GLES20.GL_FLOAT,
false, 0, mVertexBuffer);
ShaderManager.checkGlError("glVertexAttribPointer");
GLES20.glEnableVertexAttribArray(mPositionHandle);
ShaderManager.checkGlError("glEnableVertexAttribArray");
GLES20.glVertexAttribPointer(mTexCoordHandle, 2, GLES20.GL_FLOAT,
false, 0, mTexCoorBuffer);
ShaderManager.checkGlError("glVertexAttribPointer");
GLES20.glEnableVertexAttribArray(mTexCoordHandle);

0x4 Problems encountered during debugging

Once the code was written and run, no correct image appeared: the screen stayed green. Here is how the green-screen problem was solved.
First I checked whether the API calls were correct. By enabling SwiftShader, every OpenGL ES API call can be logged; inspecting the app's OpenGL ES calls revealed no problem.

01-01 04:50:52.439 8547 8615 E libGLESv2_swiftshader: ClearColor external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:710 ((GLclampf red = 1.000000, GLclampf green = 0.000000, GLclampf blue = 1.000000, GLclampf alpha = 1.000000))
01-01 04:50:52.439 8547 8615 E libGLESv2_swiftshader: Clear external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:692 ((GLbitfield mask = 4100))
01-01 04:50:52.451 8547 8615 E libGLESv2_swiftshader: UseProgram external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:5925 ((GLuint program = 3))
01-01 04:50:52.451 8547 8615 E libGLESv2_swiftshader: BindTexture external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:354 ((GLenum target = 0xDE1, GLuint texture = 1))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: Uniform1iv external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:5545 ((GLint location = 0, GLsizei count = 1, const GLint* v = 0x7b00d3a1621c))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: BindTexture external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:354 ((GLenum target = 0xDE1, GLuint texture = 2))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: Uniform1iv external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:5545 ((GLint location = 1, GLsizei count = 1, const GLint* v = 0x7b00d3a1621c))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: BindTexture external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:354 ((GLenum target = 0xDE1, GLuint texture = 3))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: Uniform1iv external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:5545 ((GLint location = 2, GLsizei count = 1, const GLint* v = 0x7b00d3a1621c))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: VertexAttribPointer external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:6127 ((GLuint index = 0, GLint size = 3, GLenum type = 0x1406, GLboolean normalized = 0, GLsizei stride = 0, const GLvoid* ptr = 0x740e1d10))
01-01 04:50:52.452 3300 3300 E libGLESv2_swiftshader: BindTexture external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:354 ((GLenum target = 0x8D65, GLuint texture = 4))
01-01 04:50:52.452 3300 3300 E libGLESv2_swiftshader: EGLImageTargetTexture2DOES external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:6659 ((GLenum target = 0x8D65, GLeglImageOES image = 0x8))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: EnableVertexAttribArray external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1865 ((GLuint index = 0))
01-01 04:50:52.452 8547 8615 E libGLESv2_swiftshader: VertexAttribPointer external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:6127 ((GLuint index = 1, GLint size = 2, GLenum type = 0x1406, GLboolean normalized = 0, GLsizei stride = 0, const GLvoid* ptr = 0x740e0b80))
01-01 04:50:52.453 8547 8615 E libGLESv2_swiftshader: EnableVertexAttribArray external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1865 ((GLuint index = 1))
01-01 04:50:52.453 8547 8615 E libGLESv2_swiftshader: DrawArrays external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1537 ((GLenum mode = 0x5, GLint first = 0, GLsizei count = 4))
01-01 04:50:52.453 3300 3300 E libGLESv2_swiftshader: Disable external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1493 ((GLenum cap = 0xC11))
01-01 04:50:52.454 3300 3300 E libGLESv2_swiftshader: Viewport external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:6189 ((GLint x = 0, GLint y = 0, GLsizei width = 1080, GLsizei height = 1920))
01-01 04:50:52.468 8547 8615 E libGLESv2_swiftshader: Finish external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1922 (())
01-01 04:50:52.728 8547 8615 E libGLESv2_swiftshader: DisableVertexAttribArray external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1520 ((GLuint index = 0))
01-01 04:50:52.729 8547 8615 E libGLESv2_swiftshader: DisableVertexAttribArray external/swiftshader/src/OpenGL/libGLESv2/libGLESv2.cpp:1520 ((GLuint index = 1))

I then suspected that the texture data was not being sent to the GPU correctly, but dumping the pixels passed to glTexImage2D() showed that they were correct as well.

Next I hard-coded the fragment shader to output a fixed color, which also worked, showing that the overall pipeline was fine.

Finally I suspected that the texture coordinates were not being set correctly. Printing the input arguments of glVertexAttribPointer() showed that the texture coordinates were all 0, clearly wrong. Debugging step by step upward revealed the cause: the Java ByteBuffer's byte order had not been set to the native byte order (ByteOrder.nativeOrder()). After setting it, the texture coordinates became correct.
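For illustration, the corrected buffer setup amounts to something like the following helper (a hypothetical utility mirroring the pattern used in this app; the method name is mine):

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;

final class BufferUtil {
    // Without order(ByteOrder.nativeOrder()) a direct ByteBuffer defaults to
    // big-endian, so the floats read by the GL driver come out garbled.
    static FloatBuffer asNativeFloatBuffer(float[] values) {
        FloatBuffer fb = ByteBuffer
                .allocateDirect(values.length * 4)
                .order(ByteOrder.nativeOrder())  // the missing call that caused the bug
                .asFloatBuffer();
        fb.put(values);
        fb.position(0);
        return fb;
    }
}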

With the green-screen problem solved, the yuv420p data displays correctly on Android; the result is shown below.
result

How swiftshader supports texture

Posted on 2018-04-08

0x1 Introduction

Here is the OpenGL ES pipeline. The pink boxes represent the data buffers in the pipeline, and the gray-blue boxes represent the operations in the pipeline.

In the rest of this article, I will show how a texture is used in the OpenGL ES pipeline. To explain it more clearly, I will dig into the texture support in SwiftShader: since SwiftShader is a software implementation of the OpenGL ES pipeline, we can see the implementation in full detail.

pipeline

0x2 Application code.

Here is the code for texture generation.
It generates and binds a texture, and sets its parameters.
GL_TEXTURE_MAG_FILTER and GL_TEXTURE_MIN_FILTER set up the filtering modes (including for mipmaps). GL_TEXTURE_WRAP_S and GL_TEXTURE_WRAP_T set up the texture wrap modes used when a texture coordinate falls outside the range [0.0, 1.0].

glGenTextures(1, mTexture, 0);
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, mTexture);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
glTexParameteri(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);

Here is the code for texture upload.
The texture's content is passed through the pixel parameter; the GPU driver copies pixel into an internally managed buffer. If the GPU hardware supports tiled rendering, there is typically also a conversion from a linear buffer to a tiled buffer at the same time; a rough sketch of such a conversion follows the code below.

glBindTexture(GL_TEXTURE_2D, mTexture);
glTexImage2D(GL_TEXTURE_2D,
0,
GL_LUMINANCE,
width,
height,
0,
GL_LUMINANCE,
GL_UNSIGNED_BYTE,
pixel);
glUniform1i(mTextureHandle, i);
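As a rough illustration of such a linear-to-tiled conversion (tile size and layout vary per GPU; a 4x4 tile of one-byte pixels is just an assumed example):

final class Tiler {
    // Copy a width x height 1-byte-per-pixel image from row-major (linear)
    // order into 4x4 tiles stored contiguously, tile after tile.
    // Assumes width and height are multiples of 4 for brevity.
    static byte[] linearToTiled(byte[] src, int width, int height) {
        final int T = 4;                       // assumed tile edge
        byte[] dst = new byte[src.length];
        int out = 0;
        for (int ty = 0; ty < height; ty += T) {
            for (int tx = 0; tx < width; tx += T) {
                for (int y = ty; y < ty + T; y++) {
                    for (int x = tx; x < tx + T; x++) {
                        dst[out++] = src[y * width + x];
                    }
                }
            }
        }
        return dst;
    }
}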

0x3 Texture in Shader code.

Here is an example of GLSL shader code.

static const char simplevs[] =
"attribute vec4 position;
attribute vec2 texCoords;
varying vec2 outTexCoords;
void main(void) {
outTexCoords = texCoords;
gl_Position = position;
}";
static const char simplefs[] =
"precision mediump float;
varying vec2 outTexCoords;
uniform sampler2D texture;
void main(void) {
gl_FragColor = texture2D(texture, outTexCoords);
}";

In the vertex shader, the texture coordinate comes in through the attribute texCoords and is output as outTexCoords.
In the primitive assembly and rasterization stages, it is clipped and interpolated.
In the fragment shader, the clipped and interpolated texture coordinate is then used to fetch the pixel at that position from the texture.
Here is a diagram of the execution sequence.
sequence

0x4 Texture support in swiftshader

Let's see how textures are supported in SwiftShader. Here is a rough diagram of the texture support in SwiftShader.

texture_in_swiftshader

0x41 Set texture for pipeline in glDrawXX()

By this point the texture has been uploaded through glTexImage2D(); when glDrawXX() is called, applyTexture() sets the texture up for the current draw context.

void Context::applyTexture(sw::SamplerType type, int index, Texture *baseTexture)
{
……
switch(baseTexture->getTarget())
{
case GL_TEXTURE_2D:
{
Texture2D *texture = static_cast<Texture2D*>(baseTexture);
for(int mipmapLevel = 0; mipmapLevel < sw::MIPMAP_LEVELS; mipmapLevel++)
{
int surfaceLevel = mipmapLevel + baseLevel;
if(surfaceLevel > maxLevel)
{
surfaceLevel = maxLevel;
}
egl::Image *surface = texture->getImage(surfaceLevel);
device->setTextureLevel(sampler, 0, mipmapLevel, surface, sw::TEXTURE_2D);
}
}
break;
……
}

0x42 Calculate texture’s coordinate in VertexProcessor.

This generates LLVM IR for the texture-coordinate calculation in the vertex shader code.

void VertexProgram::program(UInt& index)
{
……
for(size_t i = 0; i < shader->getLength(); i++)
{
const Shader::Instruction *instruction = shader->getInstruction(i);
Shader::Opcode opcode = instruction->opcode;
……
}
……
}

0x43 Clip and interpolate texture coordinate in SetupProcessor()

void SetupRoutine::generate()
{
// Culling
if(solidTriangle)
{
Float A = (y2 - y0) * x1 + (y1 - y2) * x0 + (y0 - y1) * x2; // Area
If(A == 0.0f)
{
Return(false);
}
Int w0w1w2 = *Pointer<Int>(v0 + pos * 16 + 12) ^
*Pointer<Int>(v1 + pos * 16 + 12) ^
*Pointer<Int>(v2 + pos * 16 + 12);
A = IfThenElse(w0w1w2 < 0, -A, A);
if(state.cullMode == CULL_CLOCKWISE)
{
If(A >= 0.0f) Return(false);
}
else if(state.cullMode == CULL_COUNTERCLOCKWISE)
{
If(A <= 0.0f) Return(false);
}
}
……
Int n = *Pointer<Int>(polygon + OFFSET(Polygon,n));
Int m = *Pointer<Int>(polygon + OFFSET(Polygon,i));
If(m != 0 || Bool(!solidTriangle))// Clipped triangle
{
……
For(Int q = 0, q < state.multiSample, q++)
{
……
Xq[n] = Xq[0];
Yq[n] = Yq[0];
// Setup edge for Rasterize
{
Int i = 0;
Do
{
edge(primitive, data, Xq[i + 1 - d], Yq[i + 1 - d], Xq[i + d], Yq[i + d], q);
i++;
}
Until(i >= n)
}
}
}
……
if(state.interpolateW)
{
……
}
if(state.interpolateZ)
{
……
}
for(int interpolant = 0; interpolant < MAX_FRAGMENT_INPUTS; interpolant++)
{
for(int component = 0; component < 4; component++)
{
int attribute = state.gradient[interpolant][component].attribute;
bool flat = state.gradient[interpolant][component].flat;
bool wrap = state.gradient[interpolant][component].wrap;
if(attribute != Unused)
{
setupGradient(primitive, tri, w012, M, v0, v1, v2, OFFSET(Vertex,v[attribute][component]), OFFSET(Primitive,V[interpolant][component]), flat, sprite, state.perspective, wrap, component);
}
}
}
}

0x44 Pass parameter for pixel shader

The parameters include the texture samplers and the texture coordinates; they are encapsulated in DrawData.

struct DrawData
{
const Constants *constants;
const void *input[MAX_VERTEX_INPUTS];
unsigned int stride[MAX_VERTEX_INPUTS];
Texture mipmap[TOTAL_IMAGE_UNITS];
const void *indices;
}
void Renderer::executeTask(int threadIndex)
{
switch(task[threadIndex].type)
{
case Task::PRIMITIVES:
……
break;
case Task::PIXELS:
{
int unit = task[threadIndex].primitiveUnit;
int visible = primitiveProgress[unit].visible;
if(visible > 0)
{
int cluster = task[threadIndex].pixelCluster;
Primitive *primitive = primitiveBatch[unit];
DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
DrawData *data = draw->data;
PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
pixelRoutine(primitive, visible, cluster, data);
}
finishRendering(task[threadIndex]);
}
break;
……
}
}
0x45 Fetch pixel from texture

PixelProgram fetches pixels from the texture using the clipped and interpolated texture coordinates.
uvwq is the texture coordinate. The following code is executed by the LLVM JIT.

Vector4f PixelProgram::sampleTexture(int samplerIndex, Vector4f &uvwq, Float4 &bias, Vector4f &dsx, Vector4f &dsy, Vector4f &offset, SamplerFunction function)
{
Pointer<Byte> texture = data + OFFSET(DrawData, mipmap) + samplerIndex * sizeof(Texture);
Vector4f c = SamplerCore(constants, state.sampler[samplerIndex]).sampleTexture(texture, uvwq.x, uvwq.y, uvwq.z, uvwq.w, bias, dsx, dsy, offset, function);
return c;
}

Here is the detailed texture-sampling algorithm. From the following code we can see that SwiftShader uses bilinear interpolation to sample the texture.

Vector4s SamplerCore::sampleQuad2D(Pointer<Byte> &texture, Float4 &u, Float4 &v, Float4 &w, Vector4f &offset, Float &lod, Int face[4], bool secondLOD, SamplerFunction function)
{
// Bilinear interpolation
if(componentCount >= 1)
{
if(has16bitTextureComponents() && hasUnsignedTextureComponent(0))
{
c0.x = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0u) + MulHigh(As<UShort4>(c1.x), f0u);
c2.x = As<UShort4>(c2.x) - MulHigh(As<UShort4>(c2.x), f0u) + MulHigh(As<UShort4>(c3.x), f0u);
c.x = As<UShort4>(c0.x) - MulHigh(As<UShort4>(c0.x), f0v) + MulHigh(As<UShort4>(c2.x), f0v);
}
else
{
if(hasUnsignedTextureComponent(0))
{
c0.x = MulHigh(As<UShort4>(c0.x), f1u1v);
c1.x = MulHigh(As<UShort4>(c1.x), f0u1v);
c2.x = MulHigh(As<UShort4>(c2.x), f1u0v);
c3.x = MulHigh(As<UShort4>(c3.x), f0u0v);
}
else
{
c0.x = MulHigh(c0.x, f1u1vs);
c1.x = MulHigh(c1.x, f0u1vs);
c2.x = MulHigh(c2.x, f1u0vs);
c3.x = MulHigh(c3.x, f0u0vs);
}
c.x = (c0.x + c1.x) + (c2.x + c3.x);
if(!hasUnsignedTextureComponent(0)) c.x = AddSat(c.x, c.x);// Correct for signed fractions
}
}
}
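To make the SIMD code above easier to follow, here is a scalar Java sketch of the same bilinear idea over a single-channel image; fu and fv play the role of the fractional weights (f0u, f0v, and friends) in the code above.

final class Bilinear {
    // Sample a width x height single-channel image at a fractional texel
    // position (u, v) by blending the four surrounding pixels.
    static float sample(float[] img, int width, int height, float u, float v) {
        int x0 = (int) Math.floor(u), y0 = (int) Math.floor(v);
        int x1 = Math.min(x0 + 1, width - 1), y1 = Math.min(y0 + 1, height - 1);
        x0 = Math.max(0, Math.min(x0, width - 1));
        y0 = Math.max(0, Math.min(y0, height - 1));
        float fu = u - (float) Math.floor(u);   // horizontal weight
        float fv = v - (float) Math.floor(v);   // vertical weight
        float c0 = img[y0 * width + x0], c1 = img[y0 * width + x1];
        float c2 = img[y1 * width + x0], c3 = img[y1 * width + x1];
        float top = c0 * (1 - fu) + c1 * fu;    // blend along u
        float bottom = c2 * (1 - fu) + c3 * fu;
        return top * (1 - fv) + bottom * fv;    // blend along v
    }
}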

learning rate in neural network

Posted on 2018-04-01

0x1 Introduction to the learning rate

The learning rate is an important hyperparameter used by most neural-network optimization algorithms (such as SGD and Adam); it controls how fast we adjust the network's weights along the loss gradient. If the learning rate is too large, we may overshoot the minimum and fail to converge, with the loss oscillating around some value. If the learning rate is too small, we descend along the loss gradient very slowly and convergence is slow as well.

The job of a neural-network optimizer is to control the variance, find the minimum, and update the model parameters until the model converges.
The parameter update rule is θ ← θ − η·∇θJ(θ), where η is the learning rate and ∇θJ(θ) is the gradient of the loss function J(θ).
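Spelled out in code, one step of this update rule is just an element-wise loop; a minimal Java sketch:

final class Sgd {
    // theta <- theta - eta * grad, applied element-wise.
    static void step(double[] theta, double[] grad, double eta) {
        for (int i = 0; i < theta.length; i++) {
            theta[i] -= eta * grad[i];
        }
    }
}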

The learning rate is one of the hardest hyperparameters to set in a neural network, and it strongly affects model performance. Many adaptive learning-rate algorithms now exist, such as Adam, which is used in the tests below. As those tests show, even though Adam adjusts the learning rate dynamically during optimization, the initial learning rate still has a large impact on model accuracy.

0x2 Building the computation graph

The test code is TensorFlow Python code that builds the convolutional neural network shown below to classify the MNIST digits 0-9.

As the network graph below shows, the MNIST data is loaded first and then processed by convolutional layer layer1, convolutional layer layer2, a fully connected layer fc_layer, a dropout layer, and finally a softmax layer layer3 that produces the probability output.

The optimizer is Adam, and an initial learning rate must be supplied when constructing it, as in the code below.

train_step = tf.train.AdamOptimizer(learning_rate).minimize(cross_entropy)

The overall computation graph is shown below.
computation_graph
The internal structure of layer1 is shown below.
It first initializes the weight and bias parameters, then performs the convolution with conv2d and adds the bias. The convolution parameters are [5, 5, 1, 32]: a 5x5 kernel, 1 color channel, and 32 different kernels. A relu activation then applies the nonlinearity, and a pooling function downsamples the convolution result by 1/2; MNIST images are 28x28, so after downsampling they become 14x14.
layer1
The internal structure of layer2 is shown below.
It is similar to layer1, except that the convolution parameters are [5, 5, 32, 64]: after layer1's 32 kernels, each sample has 32 channels, and the number of kernels grows to 64. After the pooling function, the data shrinks from 14x14 to 7x7.
layer2
Next comes a fully connected layer fc_layer with 1024 hidden units and a relu activation.

Then comes the dropout layer, which uses the keep_prob parameter to reduce overfitting.

Next is layer3, which maps the 1024 hidden units to 10 units through a matrix transform (whose parameters are weights and biases) and then applies softmax to obtain probability outputs for the digits 0-9.

The optimizer is Adam, which optimizes the defined loss function.

0x3 Test results with a high learning rate

Here the initial learning rate is set to 0.5. The accuracy on the training set is very low, oscillating between roughly 0.05 and 0.15.
high_learning_rate

0x4 Test results with a low learning rate

Here the initial learning rate is set to 0.001. The accuracy on the training set rises steadily and finally stabilizes above 0.95.
low_learning_rate

0x5 The Adam optimizer

Using gdb, the call stack of TensorFlow's Adam optimizer looks like this:

(gdb) bt
#0 tensorflow::functor::ApplyAdamNonCuda<Eigen::ThreadPoolDevice, float>::operator()(Eigen::ThreadPoolDevice const&, Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::TensorFixedSize<float const, Eigen::Sizes<>, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::TensorFixedSize<float const, Eigen::Sizes<>, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::TensorFixedSize<float const, Eigen::Sizes<>, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::TensorFixedSize<float const, Eigen::Sizes<>, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::TensorFixedSize<float const, Eigen::Sizes<>, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::TensorFixedSize<float const, Eigen::Sizes<>, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer>, bool) (this=0x7fffc58d8260, d=..., var=..., m=..., v=..., beta1_power=...,
beta2_power=..., lr=..., beta1=..., beta2=..., epsilon=..., grad=..., use_nesterov=false) at tensorflow/core/kernels/training_ops.cc:293
#1 0x00007fffede52e6d in tensorflow::ApplyAdamOp<Eigen::ThreadPoolDevice, float>::Compute (this=0x555558f61cf0, ctx=0x7fffc58d8840)
at tensorflow/core/kernels/training_ops.cc:2523
#2 0x00007fffe6a2e91b in tensorflow::ThreadPoolDevice::Compute (this=0x555557f25fa0, op_kernel=0x555558f61cf0, context=0x7fffc58d8840)
at tensorflow/core/common_runtime/threadpool_device.cc:59
#3 0x00007fffe69c9c0a in tensorflow::(anonymous namespace)::ExecutorState::Process (this=0x55555945b6c0, tagged_node=..., scheduled_usec=0)
at tensorflow/core/common_runtime/executor.cc:1652
#4 0x00007fffe69d81a7 in std::_Mem_fn_base<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long), true>::operator()<tensorflow::(anonymous namespace)::ExecutorState::TaggedNode&, long long&, void> (
this=0x7fff8c04b440, __object=0x55555945b6c0) at /usr/include/c++/5/functional:600
#5 0x00007fffe69d7c62 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::__call<void, 0ul, 1ul, 2ul>(<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>, std::_Index_tuple<0ul, 1ul, 2ul>) (this=0x7fff8c04b440,
__args=<unknown type in /home/kevin/anaconda3/lib/python3.6/site-packages/tensorflow/python/../libtensorflow_framework.so, CU 0x314f884, DIE 0x31e50c1>) at /usr/include/c++/5/functional:1074
#6 0x00007fffe69d5a06 in std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>::operator()<, void>(void) (this=0x7fff8c04b440) at /usr/include/c++/5/functional:1133
#7 0x00007fffe69d32ac in std::_Function_handler<void(), std::_Bind<std::_Mem_fn<void (tensorflow::(anonymous namespace)::ExecutorState::*)(tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)>(tensorflow::(anonymous namespace)::ExecutorState*, tensorflow::(anonymous namespace)::ExecutorState::TaggedNode, long long int)> >::_M_invoke(const std::_Any_data &) (__functor=...)
at /usr/include/c++/5/functional:1871
#8 0x00007fffe6297984 in std::function<void ()>::operator()() const (this=0x7fff8c04a930) at /usr/include/c++/5/functional:2267
#9 0x00007fffe64afb9e in tensorflow::thread::EigenEnvironment::ExecuteTask (this=0x555557f5ca48, t=...)
at tensorflow/core/lib/core/threadpool.cc:81
#10 0x00007fffe64b265c in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::WorkerLoop (this=0x555557f5ca40,
thread_id=1) at external/eigen_archive/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h:232
#11 0x00007fffe64b0aae in Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::NonBlockingThreadPoolTempl(int, bool, tensorflow::thread::EigenEnvironment)::{lambda()#1}::operator()() const ()
at external/eigen_archive/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h:65
#12 0x00007fffe64b3c7c in std::_Function_handler<void (), Eigen::NonBlockingThreadPoolTempl<tensorflow::thread::EigenEnvironment>::NonBlockingThreadPoolTempl(int, bool, tensorflow::thread::EigenEnvironment)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (__functor=...)
at /usr/include/c++/5/functional:1871
#13 0x00007fffe6297984 in std::function<void ()>::operator()() const (this=0x555557f6d0b0) at /usr/include/c++/5/functional:2267
#14 0x00007fffe64af907 in tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}::operator()() const (
__closure=0x555557f6d0b0) at tensorflow/core/lib/core/threadpool.cc:56
#15 0x00007fffe64b18d8 in std::_Function_handler<void (), tensorflow::thread::EigenEnvironment::CreateThread(std::function<void ()>)::{lambda()#1}>::_M_invoke(std::_Any_data const&) (__functor=...) at /usr/include/c++/5/functional:1871
#16 0x00007fffe6297984 in std::function<void ()>::operator()() const (this=0x555557f6d108) at /usr/include/c++/5/functional:2267
#17 0x00007fffe64f4f38 in std::_Bind_simple<std::function<void ()> ()>::_M_invoke<>(std::_Index_tuple<>) (this=0x555557f6d108)
at /usr/include/c++/5/functional:1531
#18 0x00007fffe64f4ea1 in std::_Bind_simple<std::function<void ()> ()>::operator()() (this=0x555557f6d108)
at /usr/include/c++/5/functional:1520
#19 0x00007fffe64f4e40 in std::thread::_Impl<std::_Bind_simple<std::function<void ()> ()> >::_M_run() (this=0x555557f6d0f0)
at /usr/include/c++/5/thread:115
#20 0x00007fffe5067c80 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6
#21 0x00007ffff7bc16ba in start_thread (arg=0x7fffc58d9700) at pthread_create.c:333
#22 0x00007ffff78f741d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

The code where TensorFlow's Adam optimizer updates the parameters is shown below.
tensorflow/core/kernels/training_ops.cc

template <typename Device, typename T>
struct ApplyAdamNonCuda {
void operator()(const Device& d, typename TTypes<T>::Flat var,
typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
typename TTypes<T>::ConstScalar beta1_power,
typename TTypes<T>::ConstScalar beta2_power,
typename TTypes<T>::ConstScalar lr,
typename TTypes<T>::ConstScalar beta1,
typename TTypes<T>::ConstScalar beta2,
typename TTypes<T>::ConstScalar epsilon,
typename TTypes<T>::ConstFlat grad, bool use_nesterov) {
const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
(T(1) - beta1_power());
// beta1 == μ
// beta2 == ν
// v == n
// var == θ
m.device(d) += (grad - m) * (T(1) - beta1());
v.device(d) += (grad.square() - v) * (T(1) - beta2());
if (use_nesterov) {
var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) /
(v.sqrt() + epsilon());
} else {
var.device(d) -= (m * alpha) / (v.sqrt() + epsilon());
}
}
};

The algorithm is described below.
As shown, although Adam adjusts the learning rate lr_t dynamically, lr_t remains tied to the initial learning_rate; if learning_rate is set badly, convergence of the trained model suffers.

t <- t + 1
lr_t <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)  // compute the dynamic learning rate
m_t <- beta1 * m_{t-1} + (1 - beta1) * g
v_t <- beta2 * v_{t-1} + (1 - beta2) * g * g
variable <- variable - lr_t * m_t / (sqrt(v_t) + epsilon)  // update the parameters
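To see how lr_t stays tied to the initial learning_rate, the following small sketch evaluates the bias-correction factor for the default betas (beta1 = 0.9 and beta2 = 0.999 are assumed here); lr_t is always the initial rate times this factor, so a poor initial rate is never corrected away.

final class AdamLr {
    public static void main(String[] args) {
        double beta1 = 0.9, beta2 = 0.999, learningRate = 0.001;
        for (int t = 1; t <= 5; t++) {
            double correction = Math.sqrt(1 - Math.pow(beta2, t))
                              / (1 - Math.pow(beta1, t));
            // lr_t scales linearly with the initial learning rate.
            System.out.printf("t=%d lr_t=%.6f%n", t, learningRate * correction);
        }
    }
}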

douyin streaming analysis

Posted on 2018-03-25

0x1 Introduction

Douyin is a short-video app that has recently become very popular. This article analyzes Douyin's online video playback process through packet capture.
The analysis reveals the following playback flow:
architecture
The server stores the user-generated content (UGC); the client connects to this server to download the content to be played.

The other three modules (downloader, local streamer, local player) are all implemented on the client. The downloader fetches data from the remote server and feeds it to the local streamer, which streams it locally; the local player then connects to the local streamer to obtain the data it plays. The packet captures below show that the downloader fetches data from the remote server over HTTP, and that the local streamer likewise serves the playback data to the local player through a local HTTP service.
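As a toy illustration of the local-streamer idea only (not Douyin's actual code; it uses the desktop JDK's built-in com.sun.net.httpserver rather than anything Android-specific), the sketch below serves whatever the downloader has buffered over a loopback HTTP endpoint that a player could read from.

import com.sun.net.httpserver.HttpServer;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.net.InetSocketAddress;

public class LocalStreamer {
    // Downloaded media bytes accumulate here (filled by the downloader thread).
    static final ByteArrayOutputStream media = new ByteArrayOutputStream();

    public static void main(String[] args) throws Exception {
        HttpServer server = HttpServer.create(new InetSocketAddress("127.0.0.1", 8090), 0);
        server.createContext("/video", exchange -> {
            byte[] data;
            synchronized (media) {
                data = media.toByteArray();   // snapshot of what has been downloaded
            }
            exchange.sendResponseHeaders(200, data.length);
            try (OutputStream os = exchange.getResponseBody()) {
                os.write(data);               // the player reads from this local URL
            }
        });
        server.start();   // the player would then play http://127.0.0.1:8090/video
    }
}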

Below we analyze how Douyin downloads and plays data on Android.

0x2 Download

0x21 DNS lookup of the server's IP address

The client resolves the IP address of v1-dy.ixiguavideo.com via DNS; this step serves the HTTP connection that follows.
As shown below, packet 49 sends the DNS query and packet 56 returns the DNS result.
dns

The DNS result is shown below, with the server's IP addresses. Several addresses are returned; the HTTP connection below uses the first one.
dns_result

0x22 Connecting to the server over HTTP and downloading data

As shown below, the client connects to the server over HTTP.
The HTTP URL is shown below. The address contains an encrypted token, so accessing it directly in a browser fails.
http://v1-dy.ixiguavideo.com/9980b014844ac5185d899a1af1e41554/5acf7425/video/m/220fea4bc73a1c54023a9819c71b52a9b3e115640520000a8c154141acc/?device_platform=android&device_type=myandroid&version_code=179
http_connection

The client can then be seen continuously downloading data from the server.
http_download

0x3 Playback

During the download, a local HTTP service is started, and the player obtains its playback data by accessing this service.
The first three packets (1130, 1131, 1132) are the three-way handshake.
The packets that follow carry the playback data sent to the player.
http_playback

Below are the native libraries in the Douyin app, which show that its playback stack is built on ffmpeg. The analysis also shows that several of these libraries each contain a copy of the ffmpeg code, so the module boundaries do not look very clean and there is redundant code.

libaudiocore.so libdaemon.so libijkffmpeg.so libst_mobile.so libttnativecrash.so
libaudiofp.so libdys.so libijkplayer.so libstatic-webp.so libttvideouploader.so
libbdEASRAndroid.v1.5.6.so libeffect.so libijksdl.so libsupervisor.so libuserinfo.so
libbspatch.so libffmpeg.so libimagepipeline.so libtnet-3.1.11.so libweibosdkcore.so
libBugly.so libffmpeg-invoker.so liblivestream.so libtongdun.so libyuv.so
libcocklogic-1.1.3.so libffmpeg-main.so libmain.so libttEncrypt.so
libcrashlytics.so libgif.so libNailSLAM_jni.so libttmplayer.so
libcrashlytics-envelope.so libgifimage.so libSDL2.so libttmplayer_mc.so

Among them, libeffect.so implements editing and effects for the video the user has shot; as shown below, it includes computer-vision-related code.

00493ecd T cvGetImage
00494511 T cvGetImageCOI
0049439d T cvGetImageROI
00491261 T cvGetMat
0048fe41 T cvGetND
0053d33d T cvGetNumThreads
004ba981 T cvGetOptimalDFTSize
0041f609 T cvGetPerspectiveTransform
0048f4f5 T cvGetRawData
0048fea1 T cvGetReal1D
004900d9 T cvGetReal2D
00545685 T cvGetRootFileNode
00490321 T cvGetReal3D

Introduction to ijkplayer

Posted on 2018-03-19

0x1 System architecture

ijkplayer is an open-source player project from Bilibili, built on ffmpeg and supporting Android and iOS. Below is a brief introduction to the Android implementation.
The Android system architecture diagram is shown below.
architecture
Each module is described below.
ijkplayer-example is the app implementation; it mainly implements the UI logic, including the activities, the organization of the UI widgets, window customization, and data storage.
ijkplayer-example plays media by calling one of three mediaplayers: ijkmediaplayer, the Android MediaPlayer, or Google ExoPlayer.
ijkplayer-java is the Java wrapper for the underlying ijkmediaplayer and the Android MediaPlayer: ijkmediaplayer is wrapped by calling the Java interface of the underlying JNI, while the Android MediaPlayer is wrapped by calling the system's default MediaPlayer interface.
ijkplayer-exo wraps Google ExoPlayer; besides the Android default player, ExoPlayer is another player Google provides on Android.
libijkplayer provides ijkmediaplayer's JNI implementation ijkplayer_jni.c, which calls the wrapped ffplayer.c, which in turn calls the underlying decoding library libijkffmpeg and the display library libijksdl.
libijkffmpeg implements demuxing, decoding, and related functions for media files.
libijksdl displays the decoded data.

0x2 Analysis of key Java-layer modules

0x21 The three mediaplayer implementations

The class diagram of the three mediaplayers is shown below.
mediaplayers
AndroidMediaPlayer wraps the Android default player.
IjkMediaPlayer is the ffmpeg-based player implementation.
IjkExoMediaPlayer wraps Google's open-source ExoPlayer.

0x22 Setting different renderers

The class diagram of the different renderers is shown below.
renders
SurfaceRenderView is the display implementation based on SurfaceView.
TextureRenderView is the display implementation based on TextureView.
Both display implementations implement the IRenderView interface.

0x23 IjkMediaPlayer's JNI interface

The JNI interface is shown in detail below.
jni
These JNI methods provide the player's basic functionality, including playback preparation (_setDataSource, _setVideoSurface, setVolume, _prepareAsync) and playback control (_start, _stop, seekTo, _release, _reset).
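On the Java side these entry points surface as the familiar player API. A sketch-level usage based on the methods listed above (exact signatures may vary across ijkplayer versions):

import android.view.Surface;
import tv.danmaku.ijk.media.player.IjkMediaPlayer;

final class PlayerDemo {
    // Minimal playback sketch: prepare asynchronously, start when ready.
    static IjkMediaPlayer play(Surface surface, String url) throws Exception {
        IjkMediaPlayer player = new IjkMediaPlayer();
        player.setDataSource(url);                       // backed by _setDataSource
        player.setSurface(surface);                      // backed by _setVideoSurface
        player.setOnPreparedListener(mp -> mp.start());  // _prepareAsync -> _start
        player.prepareAsync();
        return player;                                   // later: stop() / release()
    }
}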

0x3 Key flows

0x31 Setting the surface

The flow for setting the surface is shown below.
setsurface
As the flow shows, _setVideoSurface() hands the UI layer's Surface object (which can be thought of as the display window) to the SDL display object as its display window (native_window); whenever SDL has content to display, it can render directly into this window.

0x32 The display flow

The display flow is shown below.
display
A brief walkthrough of the flow follows.
After the decoding thread ffp_video_thread finishes decoding a frame, it calls queue_picture to send the buffer to be displayed to the SDL module.
It then calls func_fill_frame() to fill the display buffer; at this point it must determine whether decoding was done by ffmpeg or by MediaCodec.
If ffmpeg, the func_fill_frame() in ijksdl_vout_overlay_ffmpeg.c is called;
otherwise the func_fill_frame() in ijksdl_vout_overlay_android_mediacodec.c is called.
The display thread video_refresh_thread can then display the frame: if the format is GPU-supported, output goes through the GPU via IJK_EGL_display; otherwise output uses ANativeWindow_lock and ANativeWindow_unlockAndPost.

How renderscript supports parallel computation

Posted on 2018-03-13

0x1 Architecture

RenderScript is a framework for running computationally intensive tasks at high performance on Android. It is similar to OpenCL, a cross-platform spec for parallel computation. RenderScript is primarily oriented toward data-parallel computation. The RenderScript runtime parallelizes work across all processors available on a device, such as multi-core CPUs, GPUs, or DSPs. RenderScript is especially useful for applications performing image processing, computational photography, or computer vision. (See Android's documentation.)
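From an application's point of view, the framework is usually driven through the Java API. For example, running the built-in blur intrinsic (whose CPU implementation section 0x25 dissects below) looks roughly like this:

import android.content.Context;
import android.graphics.Bitmap;
import android.renderscript.Allocation;
import android.renderscript.Element;
import android.renderscript.RenderScript;
import android.renderscript.ScriptIntrinsicBlur;

final class BlurDemo {
    // Blur a bitmap in place using the IntrinsicBlur kernel discussed below.
    static void blur(Context context, Bitmap bitmap) {
        RenderScript rs = RenderScript.create(context);
        Allocation in = Allocation.createFromBitmap(rs, bitmap);
        Allocation out = Allocation.createTyped(rs, in.getType());
        ScriptIntrinsicBlur script = ScriptIntrinsicBlur.create(rs, Element.U8_4(rs));
        script.setRadius(8f);        // blur radius in pixels (0 < r <= 25)
        script.setInput(in);
        script.forEach(out);         // the runtime parallelizes this across cores
        out.copyTo(bitmap);
        rs.destroy();
    }
}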

Here is the architecture of RenderScript.
architecture

RS Wrapper is the wrapper layer for RenderScript; it provides the RenderScript API mapping and resource management.
cpu ref is the software implementation of RenderScript on the CPU.
gpu rs is the hardware implementation of RenderScript on the GPU.
slang/llvm provides the compiler front end and back end for RenderScript's C99-derived language.

In this article, we will discuss how the software RenderScript implementation is supported on multi-core CPUs.

0x2 Software Implementation

cpu_rs

0x21 Create Threads

Threads for parallel computing are created based on the number of CPU cores.

bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
sym_lookup_t lfn, script_lookup_t slfn) {
… …
GetCpuInfo();
int cpu = sysconf(_SC_NPROCESSORS_CONF);
if(mRSC->props.mDebugMaxThreads) {
cpu = mRSC->props.mDebugMaxThreads;
}
if (cpu < 2) {
mWorkers.mCount = 0;
return true;
}
// Subtract one from the cpu count because we also use the command thread as a worker.
mWorkers.mCount = (uint32_t)(cpu - 1);
for (uint32_t ct=0; ct < mWorkers.mCount; ct++) {
status = pthread_create(&mWorkers.mThreadId[ct], &threadAttr, helperThreadProc, this);
if (status) {
mWorkers.mCount = ct;
ALOGE("Created fewer than expected number of RS threads.");
break;
}
}

0x22 Thread implementation

Here is the thread’s source code.

void * RsdCpuReferenceImpl::helperThreadProc(void *vrsc) {
RsdCpuReferenceImpl *dc = (RsdCpuReferenceImpl *)vrsc;
uint32_t idx = __sync_fetch_and_add(&dc->mWorkers.mLaunchCount, 1);
dc->mWorkers.mLaunchSignals[idx].init();
dc->mWorkers.mNativeThreadId[idx] = gettid();
memset(&dc->mTlsStruct, 0, sizeof(dc->mTlsStruct));
int status = pthread_setspecific(gThreadTLSKey, &dc->mTlsStruct);
if (status) {
ALOGE("pthread_setspecific %i", status);
}
while (!dc->mExit) {
dc->mWorkers.mLaunchSignals[idx].wait();
if (dc->mWorkers.mLaunchCallback) {
// idx +1 is used because the calling thread is always worker 0.
dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1);
}
__sync_fetch_and_sub(&dc->mWorkers.mRunningCount, 1);
dc->mWorkers.mCompleteSignal.set();
}
return nullptr;
}

dc->mWorkers.mLaunchSignals[idx].wait() is used to wait for the task to be processed.

dc->mWorkers.mLaunchCallback will call the actual processing routine.

dc->mWorkers.mCompleteSignal.set() is used to indicate the processing is complete.

0x23 Launch Thread

dc->mWorkers.mLaunchSignals[idx].wait() is signaled in RsdCpuReferenceImpl::launchThreads().

In RsdCpuReferenceImpl::launchThreads(), we can also see that mWorkers.mCompleteSignal.wait() waits for the executing threads to finish.

The WorkerCallback_t cbk passed in performs the actual processing.

void RsdCpuReferenceImpl::launchThreads(WorkerCallback_t cbk, void *data) {
    mWorkers.mLaunchData = data;
    mWorkers.mLaunchCallback = cbk;

    for (uint32_t ct = 0; ct < mWorkers.mCount; ct++) {
        mWorkers.mLaunchSignals[ct].set();
    }

    // We use the calling thread as one of the workers so we can start without
    // the delay of the thread wakeup.
    if (mWorkers.mLaunchCallback) {
        mWorkers.mLaunchCallback(mWorkers.mLaunchData, 0);
    }

    while (__sync_fetch_and_or(&mWorkers.mRunningCount, 0) != 0) {
        mWorkers.mCompleteSignal.wait();
    }
}

Here are the call sites where launchThreads() is used:

Line 767: launchThreads(walk_3d_reduce, mtls);
Line 770: launchThreads(walk_2d_reduce, mtls);
Line 773: launchThreads(walk_1d_reduce, mtls);
Line 851: launchThreads(walk_general_foreach, mtls);
Line 873: launchThreads(walk_2d_foreach, mtls);
Line 895: launchThreads(walk_1d_foreach, mtls);

0x24 Thread Execution

Each thread sets up its work according to the current mSliceNum: it computes yStart and yEnd, then executes the kernel from yStart to yEnd.
The kernel is set in RsdCpuScriptIntrinsic::invokeForEach() or RsdCpuScriptImpl::forEachKernelSetup().

RsdCpuScriptIntrinsic::invokeForEach() sets up the kernel for an intrinsic.

RsdCpuScriptImpl::forEachKernelSetup() sets up a user-defined kernel from a *.rs file.

static void walk_2d_foreach(void *usr, uint32_t idx) {
    MTLaunchStructForEach *mtls = (MTLaunchStructForEach *)usr;
    RsExpandKernelDriverInfo fep = mtls->fep;
    fep.lid = idx;
    ForEachFunc_t fn = mtls->kernel;

    while (1) {
        uint32_t slice = (uint32_t)__sync_fetch_and_add(&mtls->mSliceNum, 1);
        uint32_t yStart = mtls->start.y + slice * mtls->mSliceSize;
        uint32_t yEnd = yStart + mtls->mSliceSize;

        yEnd = rsMin(yEnd, mtls->end.y);
        if (yEnd <= yStart) {
            return;
        }

        for (fep.current.y = yStart; fep.current.y < yEnd; fep.current.y++) {
            FepPtrSetup(mtls, &fep, mtls->start.x, fep.current.y);
            fn(&fep, mtls->start.x, mtls->end.x, fep.outStride[0]);
        }
    }
}

0x25 Kernel Implementation

Let's use IntrinsicBlur as the example. In its kernel function kernelU1(), it produces the output pixels from xstart to xend.
The algorithm is based on Gaussian weights, which are initialized in ComputeGaussianWeights(); a sketch of that initialization follows the kernel code below.

void RsdCpuScriptIntrinsicBlur::kernelU1(const RsExpandKernelDriverInfo *info,
                                         uint32_t xstart, uint32_t xend,
                                         uint32_t outstep) {
    float buf[4 * 2048];
    RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)info->usr;
    if (!cp->mAlloc.get()) {
        ALOGE("Blur executed without input, skipping");
        return;
    }
    const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
    const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;

    uchar *out = (uchar *)info->outPtr[0];
    uint32_t x1 = xstart;
    uint32_t x2 = xend;

    float *fout = (float *)buf;
    int y = info->current.y;
    if ((y > cp->mIradius) && (y < ((int)info->dim.y - cp->mIradius - 1))) {
        const uchar *pi = pin + (y - cp->mIradius) * stride;
        OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, 0, info->dim.x);
    } else {
        x1 = 0;
        while (info->dim.x > x1) {
            OneVU1(info, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
            fout++;
            x1++;
        }
    }

    x1 = xstart;
    while ((x1 < x2) &&
           ((x1 < (uint32_t)cp->mIradius) || (((uintptr_t)out) & 0x3))) {
        OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
        out++;
        x1++;
    }

    while (x2 > x1) {
        OneHU1(info, out, x1, buf, cp->mFp, cp->mIradius);
        out++;
        x1++;
    }
}
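
ComputeGaussianWeights() itself is not shown above. As a rough sketch of what such an initialization could look like (illustrative names and layout, not the actual RenderScript code), a normalized 1D Gaussian kernel for a given radius can be computed like this:

#include <cmath>
#include <vector>

// Illustrative only: build a normalized 1D Gaussian kernel of the kind
// the blur kernel consumes via cp->mFp.
std::vector<float> computeGaussianWeights(float sigma, int radius) {
    const float kPi = 3.14159265f;
    std::vector<float> weights(2 * radius + 1);
    const float coeff = 1.0f / (sigma * std::sqrt(2.0f * kPi));

    float sum = 0.0f;
    for (int r = -radius; r <= radius; r++) {
        float w = coeff * std::exp(-(float(r) * float(r)) / (2.0f * sigma * sigma));
        weights[r + radius] = w;
        sum += w;
    }
    for (float &w : weights) {
        w /= sum;   // normalize so the weights sum to 1
    }
    return weights;
}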

Analysis of SwiftShader

Posted on 2018-02-25

0x1 Architecture

From the SwiftShader documentation at the following link, we can see an introduction to its architecture.
https://github.com/google/swiftshader/blob/master/docs/Index.md
swiftshader1

The API layer is an implementation of a graphics API, such as OpenGL (ES) or Direct3D, on top of the Renderer interface. It is responsible for managing API-level resources and rendering state, as well as compiling high-level shaders to bytecode form.

The Renderer layer generates specialized processing routines for draw calls and coordinates the execution of rendering tasks. It defines the data structures used and how the processing is performed.

Reactor is an embedded language for C++ to dynamically generate code in a WYSIWYG fashion. It allows specializing the processing routines for the state and shaders used by each draw call. Its syntax closely resembles C and shading languages, which makes the generated code easily readable.
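
For a feel of what Reactor code looks like, here is a small example adapted from SwiftShader's Reactor documentation (treat the exact API spellings as approximate; they vary across SwiftShader versions). The C++ below records operations into an in-memory IR, and calling the returned entry point executes the JIT-compiled routine.

void example() {
    Function<Int(Int, Int)> function;
    {
        Int x = function.Arg<0>();
        Int y = function.Arg<1>();

        Int sum = x + y;

        Return(sum);
    }

    auto routine = function("AddInts");
    auto add = (int (*)(int, int))routine->getEntry();
    int result = add(1, 2);   // result == 3, computed by JIT-generated code
}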

The JIT layer is a run-time compiler, such as LLVM’s JIT, or Subzero. Reactor records its operations in an in-memory intermediate form which can be materialized by the JIT into a function which can be called directly.

To achieve exceptional performance, SwiftShader is built around two major optimizations that affect its architecture: dynamic code generation and parallel processing. Generating code at run time makes it possible to eliminate code branches and optimize register usage, specializing the processing routines for exactly the operations required by each draw call. Parallel processing means both utilizing the CPU's multiple cores and processing multiple elements across the width of the SIMD vector units.

0x2 Graphics Pipeline

Here is the graphics pipeline diagram from the OpenGL ES specification linked below. SwiftShader supports the pipeline by JIT-compiling it through LLVM and then executing it on the CPU.
https://www.khronos.org/registry/OpenGL/specs/es/2.0/es_full_spec_2.0.pdf

swiftshader2

0x3 JIT through LLVM

0x31 GLSL compiler frontend

SwiftShader uses glslang as its GLSL compiler frontend: it consumes GLSL source code and produces an AST, then outputs its IR by recursively traversing that AST. Here is the IR definition.

enum Opcode
{
// Matches order in d3d9types.h
OPCODE_NOP = 0,
OPCODE_MOV,
OPCODE_ADD,
OPCODE_SUB,
OPCODE_MAD,
OPCODE_MUL,
OPCODE_RCPX,
OPCODE_RSQX,
OPCODE_DP3,
OPCODE_DP4,
OPCODE_MIN,
OPCODE_MAX,
OPCODE_SLT,
OPCODE_SGE,
OPCODE_EXP2X, // D3DSIO_EXP
OPCODE_LOG2X, // D3DSIO_LOG
OPCODE_LIT,
OPCODE_ATT, // D3DSIO_DST
OPCODE_LRP,
OPCODE_FRC,
OPCODE_M4X4,
OPCODE_M4X3,
OPCODE_M3X4,
OPCODE_M3X3,
OPCODE_M3X2,
OPCODE_CALL,
OPCODE_CALLNZ,
OPCODE_LOOP,
OPCODE_RET,
OPCODE_ENDLOOP,
OPCODE_LABEL,
OPCODE_DCL,
OPCODE_POWX,
OPCODE_CRS,
OPCODE_SGN,
OPCODE_ABS,
OPCODE_NRM3, // D3DSIO_NRM
OPCODE_SINCOS,
OPCODE_REP,
OPCODE_ENDREP,
OPCODE_IF,
OPCODE_IFC,
OPCODE_ELSE,
OPCODE_ENDIF,
OPCODE_BREAK,
OPCODE_BREAKC,
OPCODE_MOVA,
OPCODE_DEFB,
OPCODE_DEFI,
OPCODE_TEXCOORD = 64,
OPCODE_TEXKILL,
OPCODE_TEX,
OPCODE_TEXBEM,
OPCODE_TEXBEML,
OPCODE_TEXREG2AR,
OPCODE_TEXREG2GB,
OPCODE_TEXM3X2PAD,
OPCODE_TEXM3X2TEX,
OPCODE_TEXM3X3PAD,
OPCODE_TEXM3X3TEX,
OPCODE_RESERVED0,
OPCODE_TEXM3X3SPEC,
OPCODE_TEXM3X3VSPEC,
OPCODE_EXPP,
OPCODE_LOGP,
OPCODE_CND,
OPCODE_DEF,
OPCODE_TEXREG2RGB,
OPCODE_TEXDP3TEX,
OPCODE_TEXM3X2DEPTH,
OPCODE_TEXDP3,
OPCODE_TEXM3X3,
OPCODE_TEXDEPTH,
OPCODE_CMP0, // D3DSIO_CMP
OPCODE_BEM,
OPCODE_DP2ADD,
OPCODE_DFDX, // D3DSIO_DSX
OPCODE_DFDY, // D3DSIO_DSY
OPCODE_TEXLDD,
OPCODE_CMP, // D3DSIO_SETP
OPCODE_TEXLDL,
OPCODE_BREAKP,
OPCODE_PHASE = 0xFFFD,
OPCODE_COMMENT = 0xFFFE,
OPCODE_END = 0xFFFF,
OPCODE_PS_1_0 = 0xFFFF0100,
OPCODE_PS_1_1 = 0xFFFF0101,
OPCODE_PS_1_2 = 0xFFFF0102,
OPCODE_PS_1_3 = 0xFFFF0103,
OPCODE_PS_1_4 = 0xFFFF0104,
OPCODE_PS_2_0 = 0xFFFF0200,
OPCODE_PS_2_x = 0xFFFF0201,
OPCODE_PS_3_0 = 0xFFFF0300,
OPCODE_VS_1_0 = 0xFFFE0100,
OPCODE_VS_1_1 = 0xFFFE0101,
OPCODE_VS_2_0 = 0xFFFE0200,
OPCODE_VS_2_x = 0xFFFE0201,
OPCODE_VS_2_sw = 0xFFFE02FF,
OPCODE_VS_3_0 = 0xFFFE0300,
OPCODE_VS_3_sw = 0xFFFE03FF,
OPCODE_NULL = 0x10000000, // Dead instruction, to be eliminated
OPCODE_WHILE,
OPCODE_ENDWHILE,
OPCODE_COS,
OPCODE_SIN,
OPCODE_TAN,
OPCODE_ACOS,
OPCODE_ASIN,
OPCODE_ATAN,
OPCODE_ATAN2,
OPCODE_COSH,
OPCODE_SINH,
OPCODE_TANH,
OPCODE_ACOSH,
OPCODE_ASINH,
OPCODE_ATANH,
OPCODE_DP1,
OPCODE_DP2,
OPCODE_TRUNC,
OPCODE_FLOOR,
OPCODE_ROUND,
OPCODE_ROUNDEVEN,
OPCODE_CEIL,
OPCODE_SQRT,
OPCODE_RSQ,
OPCODE_LEN2,
OPCODE_LEN3,
OPCODE_LEN4,
OPCODE_DIST1,
OPCODE_DIST2,
OPCODE_DIST3,
OPCODE_DIST4,
OPCODE_NRM2,
OPCODE_NRM4,
OPCODE_DIV,
OPCODE_MOD,
OPCODE_EXP2,
OPCODE_LOG2,
OPCODE_EXP,
OPCODE_LOG,
OPCODE_POW,
OPCODE_F2B, // Float to bool
OPCODE_B2F, // Bool to float
OPCODE_F2I, // Float to int
OPCODE_I2F, // Int to float
OPCODE_F2U, // Float to uint
OPCODE_U2F, // Uint to float
OPCODE_I2B, // Int to bool
OPCODE_B2I, // Bool to int
OPCODE_DET2,
OPCODE_DET3,
OPCODE_DET4,
OPCODE_ALL,
OPCODE_ANY,
OPCODE_NEG,
OPCODE_NOT,
OPCODE_OR,
OPCODE_XOR,
OPCODE_AND,
OPCODE_EQ,
OPCODE_NE,
OPCODE_STEP,
OPCODE_SMOOTH,
OPCODE_ISNAN,
OPCODE_ISINF,
OPCODE_TEXOFFSET,
OPCODE_TEXLODOFFSET,
OPCODE_TEXELFETCH,
OPCODE_TEXELFETCHOFFSET,
OPCODE_TEXGRAD,
OPCODE_TEXGRADOFFSET,
OPCODE_TEXBIAS,
OPCODE_TEXLOD,
OPCODE_TEXOFFSETBIAS,
OPCODE_TEXRECT,
OPCODE_TEXSIZE,
OPCODE_FLOATBITSTOINT,
OPCODE_FLOATBITSTOUINT,
OPCODE_INTBITSTOFLOAT,
OPCODE_UINTBITSTOFLOAT,
OPCODE_PACKSNORM2x16,
OPCODE_PACKUNORM2x16,
OPCODE_PACKHALF2x16,
OPCODE_UNPACKSNORM2x16,
OPCODE_UNPACKUNORM2x16,
OPCODE_UNPACKHALF2x16,
OPCODE_FORWARD1,
OPCODE_FORWARD2,
OPCODE_FORWARD3,
OPCODE_FORWARD4,
OPCODE_REFLECT1,
OPCODE_REFLECT2,
OPCODE_REFLECT3,
OPCODE_REFLECT4,
OPCODE_REFRACT1,
OPCODE_REFRACT2,
OPCODE_REFRACT3,
OPCODE_REFRACT4,
OPCODE_ICMP,
OPCODE_UCMP,
OPCODE_SELECT,
OPCODE_EXTRACT,
OPCODE_INSERT,
OPCODE_DISCARD,
OPCODE_FWIDTH,
OPCODE_LEAVE, // Return before the end of the function
OPCODE_CONTINUE,
OPCODE_TEST, // Marks the end of the code that can be skipped by 'continue'
OPCODE_SWITCH,
OPCODE_ENDSWITCH,
// Integer opcodes
OPCODE_INEG,
OPCODE_IABS,
OPCODE_ISGN,
OPCODE_IADD,
OPCODE_ISUB,
OPCODE_IMUL,
OPCODE_IDIV,
OPCODE_IMAD,
OPCODE_IMOD,
OPCODE_SHL,
OPCODE_ISHR,
OPCODE_IMIN,
OPCODE_IMAX,
// Unsigned integer opcodes
OPCODE_UDIV,
OPCODE_UMOD,
OPCODE_USHR,
OPCODE_UMIN,
OPCODE_UMAX,
};

Here is the callstack showing how the IR is generated by traversing the AST.

swiftshader3
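
Conceptually the traversal is a post-order walk that emits one of the opcodes above per AST node: operands are emitted before the operation that consumes them. A toy illustration (not the glslang/SwiftShader code; AstNode and emit are invented for this sketch):

#include <vector>

// Toy post-order emitter: children first, then the node's opcode.
// The real traverser also handles types, registers, and control flow.
struct AstNode {
    Opcode op;                          // e.g. OPCODE_MUL from the enum above
    std::vector<AstNode *> children;
};

void emit(const AstNode *node, std::vector<Opcode> &ir) {
    for (const AstNode *child : node->children) {
        emit(child, ir);                // emit operands first
    }
    ir.push_back(node->op);
}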

Once the IR is ready, SwiftShader consumes it and generates LLVM IR; some fixed-function pipeline and graphics state is baked into the LLVM IR at the same time.

We will discuss the three processors, the vertex processor, the setup processor, and the pixel processor; all three generate LLVM IR according to the graphics state.

0x32 Vertex processor

Here is a diagram of how the vertex processor produces LLVM IR.
swiftshader4

Prepare the draw state

It prepares the draw state by updating the State struct according to the graphics state.

const VertexProcessor::State VertexProcessor::update(DrawType drawType)
{
    if(isFixedFunction())
    {
        updateTransform();

        if(updateLighting)
        {
            for(int i = 0; i < 8; i++)
            {
                if(context->vertexLightActive(i))
                {
                    // Light position in camera coordinates
                    setLightViewPosition(i, B * V * context->getLightPosition(i));
                }
            }

            updateLighting = false;
        }
    }

    State state;

    if(context->vertexShader)
    {
        state.shaderID = context->vertexShader->getSerialID();
    }
    else
    {
        state.shaderID = 0;
    }

    state.fixedFunction = !context->vertexShader && context->pixelShaderModel() < 0x0300;
    state.textureSampling = context->vertexShader ? context->vertexShader->containsTextureSampling() : false;
    state.positionRegister = context->vertexShader ? context->vertexShader->getPositionRegister() : Pos;
    state.pointSizeRegister = context->vertexShader ? context->vertexShader->getPointSizeRegister() : Pts;

    state.vertexBlendMatrixCount = context->vertexBlendMatrixCountActive();
    state.indexedVertexBlendEnable = context->indexedVertexBlendActive();
    state.vertexNormalActive = context->vertexNormalActive();
    state.normalizeNormals = context->normalizeNormalsActive();
    state.vertexLightingActive = context->vertexLightingActive();
    state.diffuseActive = context->diffuseActive();
    state.specularActive = context->specularActive();
    state.vertexSpecularActive = context->vertexSpecularActive();

    state.vertexLightActive = context->vertexLightActive(0) << 0 |
                              context->vertexLightActive(1) << 1 |
                              context->vertexLightActive(2) << 2 |
                              context->vertexLightActive(3) << 3 |
                              context->vertexLightActive(4) << 4 |
                              context->vertexLightActive(5) << 5 |
                              context->vertexLightActive(6) << 6 |
                              context->vertexLightActive(7) << 7;

    state.vertexDiffuseMaterialSourceActive = context->vertexDiffuseMaterialSourceActive();
    state.vertexSpecularMaterialSourceActive = context->vertexSpecularMaterialSourceActive();
    state.vertexAmbientMaterialSourceActive = context->vertexAmbientMaterialSourceActive();
    state.vertexEmissiveMaterialSourceActive = context->vertexEmissiveMaterialSourceActive();

    state.fogActive = context->fogActive();
    state.vertexFogMode = context->vertexFogModeActive();
    state.rangeFogActive = context->rangeFogActive();
    state.localViewerActive = context->localViewerActive();
    state.pointSizeActive = context->pointSizeActive();
    state.pointScaleActive = context->pointScaleActive();

    state.preTransformed = context->preTransformed;
    state.superSampling = context->getSuperSampleCount() > 1;
    state.multiSampling = context->getMultiSampleCount() > 1;

    state.transformFeedbackQueryEnabled = context->transformFeedbackQueryEnabled;
    state.transformFeedbackEnabled = context->transformFeedbackEnabled;

    // Note: Quads aren't handled for verticesPerPrimitive, but verticesPerPrimitive is used for transform feedback,
    // which is an OpenGL ES 3.0 feature, and OpenGL ES 3.0 doesn't support quads as a primitive type.
    DrawType type = static_cast<DrawType>(static_cast<unsigned int>(drawType) & 0xF);
    state.verticesPerPrimitive = 1 + (type >= DRAW_LINELIST) + (type >= DRAW_TRIANGLELIST);

    for(int i = 0; i < MAX_VERTEX_INPUTS; i++)
    {
        state.input[i].type = context->input[i].type;
        state.input[i].count = context->input[i].count;
        state.input[i].normalized = context->input[i].normalized;
        state.input[i].attribType = context->vertexShader ? context->vertexShader->getAttribType(i) : VertexShader::ATTRIBTYPE_FLOAT;
    }

    if(!context->vertexShader)
    {
        for(int i = 0; i < 8; i++)
        {
            // state.textureState[i].vertexTextureActive = context->vertexTextureActive(i, 0);
            state.textureState[i].texGenActive = context->texGenActive(i);
            state.textureState[i].textureTransformCountActive = context->textureTransformCountActive(i);
            state.textureState[i].texCoordIndexActive = context->texCoordIndexActive(i);
        }
    }
    else
    {
        for(unsigned int i = 0; i < VERTEX_TEXTURE_IMAGE_UNITS; i++)
        {
            if(context->vertexShader->usesSampler(i))
            {
                state.sampler[i] = context->sampler[TEXTURE_IMAGE_UNITS + i].samplerState();
            }
        }
    }

    if(context->vertexShader)   // FIXME: Also when pre-transformed?
    {
        for(int i = 0; i < MAX_VERTEX_OUTPUTS; i++)
        {
            state.output[i].xWrite = context->vertexShader->getOutput(i, 0).active();
            state.output[i].yWrite = context->vertexShader->getOutput(i, 1).active();
            state.output[i].zWrite = context->vertexShader->getOutput(i, 2).active();
            state.output[i].wWrite = context->vertexShader->getOutput(i, 3).active();
        }
    }
    else if(!context->preTransformed || context->pixelShaderModel() < 0x0300)
    {
        state.output[Pos].write = 0xF;

        if(context->diffuseActive() && (context->lightingEnable || context->input[Color0]))
        {
            state.output[C0].write = 0xF;
        }

        if(context->specularActive())
        {
            state.output[C1].write = 0xF;
        }

        for(int stage = 0; stage < 8; stage++)
        {
            if(context->texCoordActive(stage, 0)) state.output[T0 + stage].write |= 0x01;
            if(context->texCoordActive(stage, 1)) state.output[T0 + stage].write |= 0x02;
            if(context->texCoordActive(stage, 2)) state.output[T0 + stage].write |= 0x04;
            if(context->texCoordActive(stage, 3)) state.output[T0 + stage].write |= 0x08;
        }

        if(context->fogActive())
        {
            state.output[Fog].xWrite = true;
        }

        if(context->pointSizeActive())
        {
            state.output[Pts].yWrite = true;
        }
    }
    else
    {
        state.output[Pos].write = 0xF;

        for(int i = 0; i < 2; i++)
        {
            if(context->input[Color0 + i])
            {
                state.output[C0 + i].write = 0xF;
            }
        }

        for(int i = 0; i < 8; i++)
        {
            if(context->input[TexCoord0 + i])
            {
                state.output[T0 + i].write = 0xF;
            }
        }

        if(context->input[PointSize])
        {
            state.output[Pts].yWrite = true;
        }
    }

    if(context->vertexShaderModel() < 0x0300)
    {
        state.output[C0].clamp = 0xF;
        state.output[C1].clamp = 0xF;
        state.output[Fog].xClamp = true;
    }

    state.hash = state.computeHash();

    return state;
}

Generate LLVM IR

It then generates the LLVM IR in the Reactor style, based on the state updated in the previous step.

void VertexRoutine::generate()
{
    const bool textureSampling = state.textureSampling;

    Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
    Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
    Pointer<Byte> tagCache = cache + OFFSET(VertexCache, tag);

    UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));
    UInt primitiveNumber = *Pointer<UInt>(task + OFFSET(VertexTask, primitiveStart));
    UInt indexInPrimitive = 0;

    constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, constants));

    Do
    {
        UInt index = *Pointer<UInt>(batch);
        UInt tagIndex = index & 0x0000003C;
        UInt indexQ = !textureSampling ? UInt(index & 0xFFFFFFFC) : index;   // FIXME: TEXLDL hack to have independent LODs, hurts performance.

        If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
        {
            *Pointer<UInt>(tagCache + tagIndex) = indexQ;

            readInput(indexQ);
            pipeline(indexQ);
            postTransform();
            computeClipFlags();

            Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
            writeCache(cacheLine0);
        }

        UInt cacheIndex = index & 0x0000003F;
        Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
        writeVertex(vertex, cacheLine);

        if(state.transformFeedbackEnabled != 0)
        {
            transformFeedback(vertex, primitiveNumber, indexInPrimitive);

            indexInPrimitive++;
            If(indexInPrimitive == 3)
            {
                primitiveNumber++;
                indexInPrimitive = 0;
            }
        }

        vertex += sizeof(Vertex);
        batch += sizeof(unsigned int);
        vertexCount--;
    }
    Until(vertexCount == 0)

    Return();
}

0x33 Setup processor

The LLVM IR generation process of the setup processor is similar to the vertex processor's.

0x34 Pixel processor

The LLVM IR generation process of the pixel processor is similar to the vertex processor's.

0x4 Multithread

Here is a timeline diagram of how multithreading is supported in SwiftShader.
It splits the draw task into several batches, and each batch is executed on one thread. The vertex processor and setup processor run together; the pixel processor runs after them.

swiftshader6

0x41 Task producer

The task producer creates tasks on the main thread, pushes them into the queue, and waits for the consumers to pick them up.

swiftshader7

0x42 Task consumer

The task consumer takes a task from the queue and executes it on a specific thread.
Here is a diagram of how the vertex processor and setup processor are executed in one thread.

swiftshader8

Here is a diagram of how the pixel processor is executed in one thread.

swiftshader9
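
The producer/consumer handoff in the diagrams above boils down to a task queue protected by a mutex and a condition variable. A simplified sketch (not SwiftShader's actual Renderer code, which uses its own task and synchronization primitives):

#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>

// Illustrative only: the main thread pushes draw batches, worker
// threads pop and execute them.
class TaskQueue {
public:
    void push(std::function<void()> task) {
        {
            std::lock_guard<std::mutex> lock(mMutex);
            mTasks.push_back(std::move(task));
        }
        mCond.notify_one();
    }

    // Worker loop: block until a task arrives, then run it.
    void workerLoop() {
        for (;;) {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mMutex);
                mCond.wait(lock, [this] { return !mTasks.empty(); });
                task = std::move(mTasks.front());
                mTasks.pop_front();
            }
            task();
        }
    }

private:
    std::deque<std::function<void()>> mTasks;
    std::mutex mMutex;
    std::condition_variable mCond;
};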

Timer support in event-driven network frameworks

Posted on 2017-07-08

0x1 Introduction

Typically there is one event loop in an event-driven asynchronous network framework: it waits for file descriptors to become ready and then performs the corresponding operations, and it usually runs in a single thread. Such frameworks also support timers; one type fires an operation once after a delay, another fires an operation repeatedly at some interval.
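
The common pattern is to derive the poll timeout from the nearest timer deadline, so the loop wakes up either for I/O or for an expired timer, whichever comes first. A condensed, illustrative sketch (not taken from any of the frameworks discussed below):

#include <algorithm>
#include <chrono>
#include <functional>
#include <map>
#include <sys/epoll.h>

// Sketch of a single-threaded loop multiplexing I/O and timers.
// Deadlines live in an ordered map so the nearest one is the first
// entry (nginx keeps them in an rbtree for the same reason).
static long long now_ms() {
    using namespace std::chrono;
    return duration_cast<milliseconds>(
        steady_clock::now().time_since_epoch()).count();
}

void event_loop(int epfd,
                std::multimap<long long, std::function<void()>> &timers) {
    epoll_event events[64];
    for (;;) {
        int timeout = -1;   // block forever when no timer is armed
        if (!timers.empty()) {
            timeout = (int)std::max(0LL, timers.begin()->first - now_ms());
        }

        int n = epoll_wait(epfd, events, 64, timeout);
        for (int i = 0; i < n; i++) {
            // handle the ready file descriptor events[i].data.fd here
        }

        // Fire every timer whose deadline has passed.
        while (!timers.empty() && timers.begin()->first <= now_ms()) {
            auto cb = timers.begin()->second;
            timers.erase(timers.begin());
            cb();
        }
    }
}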

Next we will look into the details of timer support in several network frameworks.

0x2 Nginx’s implementation

In Nginx, ngx_process_events_and_timers() handles timers: it detects the expired timers and then calls their handlers to do further processing.

In the following code there is one variable, ngx_timer_resolution, which selects between two ways of handling timers.

If ngx_timer_resolution is not zero, the timer parameter is set to -1 and passed to ngx_process_events(); with a timeout of -1, epoll_wait() waits indefinitely until some file descriptor becomes ready, so timers are not precise in this mode. ngx_timer_resolution is configured in nginx's configuration file.

If ngx_timer_resolution is zero, ngx_event_find_timer() is called to get the delta between the next expiration time and the current time; the pending expiration times are stored in an rbtree.

In ngx_process_events_and_timers(), ngx_event_expire_timers() is then called to process the expired timers and invoke each timer's handler.

void ngx_process_events_and_timers(ngx_cycle_t *cycle)
{
    ngx_uint_t  flags;
    ngx_msec_t  timer, delta;

    if (ngx_timer_resolution) {
        timer = NGX_TIMER_INFINITE;
        flags = 0;
    } else {
        timer = ngx_event_find_timer();
        flags = NGX_UPDATE_TIME;
    }
    ...
    delta = ngx_current_msec;

    (void) ngx_process_events(cycle, timer, flags);

    delta = ngx_current_msec - delta;

    ngx_event_process_posted(cycle, &ngx_posted_accept_events);
    ...
    if (delta) {
        ngx_event_expire_timers();
    }

    ngx_event_process_posted(cycle, &ngx_posted_events);
}

static ngx_int_t ngx_epoll_process_events(ngx_cycle_t *cycle, ngx_msec_t timer, ngx_uint_t flags)
{
    ...
    events = epoll_wait(ep, event_list, (int) nevents, timer);
    ...
    if (flags & NGX_UPDATE_TIME || ngx_event_timer_alarm) {
        ngx_time_update();
    }
    ...
    for (i = 0; i < events; i++) {
        c = event_list[i].data.ptr;
        ...
        revents = event_list[i].events;
        ...
        wev = c->write;
        ...
        if (flags & NGX_POST_EVENTS) {
            ngx_post_event(wev, &ngx_posted_events);
        } else {
            wev->handler(wev);
        }
        ...
    }
    ...
}

In ngx_epoll_process_events(), it calls ngx_time_update() to update the cached time; ngx_time_update() calls the system function gettimeofday(), and system performance would suffer if gettimeofday() were called too frequently.

void ngx_time_update(void)
{
    ...
    ngx_gettimeofday(&tv);
    ...
    sec = tv.tv_sec;
    msec = tv.tv_usec / 1000;
    ...
}
void ngx_event_expire_timers(void)
{
    ...
    for ( ;; ) {
        root = ngx_event_timer_rbtree.root;
        ...
        if (root == sentinel) {
            return;
        }
        ...
        node = ngx_rbtree_min(root, sentinel);
        ...
        if ((ngx_msec_int_t) (node->key - ngx_current_msec) > 0) {
            return;
        }

        ev = (ngx_event_t *) ((char *) node - offsetof(ngx_event_t, timer));

        ngx_rbtree_delete(&ngx_event_timer_rbtree, &ev->timer);

        ev->timer_set = 0;
        ev->timedout = 1;

        ev->handler(ev);
    }
}

0x3 Redis’s implementation

Here are the functions Redis provides to add and delete timers; a timer's user can call them to get timer service from the event loop thread. In Redis, serverCron() is registered as a timer callback through aeCreateTimeEvent(), and serverCron() is then called at an interval to do resource and status checking of the Redis server.
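
For example, Redis's initServer() registers serverCron roughly like this (simplified):

// Simplified from Redis's initServer(): arm serverCron to fire 1 ms
// from now. Because serverCron returns the next interval in ms rather
// than AE_NOMORE, processTimeEvents() keeps re-arming it, which turns
// a one-shot timer into a periodic one.
if (aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL) == AE_ERR) {
    serverPanic("Can't create event loop timers.");
    exit(1);
}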

long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
        aeTimeProc *proc, void *clientData,
        aeEventFinalizerProc *finalizerProc)
{
    long long id = eventLoop->timeEventNextId++;
    aeTimeEvent *te;

    te = zmalloc(sizeof(*te));
    if (te == NULL) return AE_ERR;

    te->id = id;
    aeAddMillisecondsToNow(milliseconds,&te->when_sec,&te->when_ms);
    te->timeProc = proc;
    te->finalizerProc = finalizerProc;
    te->clientData = clientData;
    te->next = eventLoop->timeEventHead;
    eventLoop->timeEventHead = te;
    return id;
}

int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
{
    aeTimeEvent *te = eventLoop->timeEventHead;
    while(te) {
        if (te->id == id) {
            te->id = AE_DELETED_EVENT_ID;
            return AE_OK;
        }
        te = te->next;
    }
    return AE_ERR;
}

Here is the code showing how timers are handled in the event loop: it searches for the nearest timer in the timer list and calculates the timeout parameter for aeApiPoll(); aeApiPoll() then waits on the file descriptors within that timeout, and afterwards processTimeEvents() processes the timers and calls the callback of each expired timer.

int aeProcessEvents(aeEventLoop *eventLoop, int flags)
{
    ...
    aeTimeEvent *shortest = NULL;
    struct timeval tv, *tvp;

    if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))
        shortest = aeSearchNearestTimer(eventLoop);
    if (shortest) {
        long now_sec, now_ms;

        aeGetTime(&now_sec, &now_ms);
        tvp = &tv;

        long long ms =
            (shortest->when_sec - now_sec)*1000 +
            shortest->when_ms - now_ms;

        if (ms > 0) {
            tvp->tv_sec = ms/1000;
            tvp->tv_usec = (ms % 1000)*1000;
        } else {
            tvp->tv_sec = 0;
            tvp->tv_usec = 0;
        }
    } else {
        if (flags & AE_DONT_WAIT) {
            tv.tv_sec = tv.tv_usec = 0;
            tvp = &tv;
        } else {
            tvp = NULL; /* wait forever */
        }
    }
    ...
    numevents = aeApiPoll(eventLoop, tvp);
    ...
    if (flags & AE_TIME_EVENTS)
        processed += processTimeEvents(eventLoop);
    ...
}

static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
    aeApiState *state = eventLoop->apidata;
    int retval, numevents = 0;

    retval = epoll_wait(state->epfd,state->events,eventLoop->setsize,
            tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
    if (retval > 0) {
        int j;
        ...
        numevents = retval;
        for (j = 0; j < numevents; j++) {
            int mask = 0;
            struct epoll_event *e = state->events+j;
            ...
            if (e->events & EPOLLIN) mask |= AE_READABLE;
            if (e->events & EPOLLOUT) mask |= AE_WRITABLE;
            if (e->events & EPOLLERR) mask |= AE_WRITABLE;
            if (e->events & EPOLLHUP) mask |= AE_WRITABLE;
            eventLoop->fired[j].fd = e->data.fd;
            eventLoop->fired[j].mask = mask;
        }
    }
    return numevents;
}

static int processTimeEvents(aeEventLoop *eventLoop) {
    int processed = 0;
    aeTimeEvent *te, *prev;
    long long maxId;
    time_t now = time(NULL);
    ...
    if (now < eventLoop->lastTime) {
        te = eventLoop->timeEventHead;
        while(te) {
            te->when_sec = 0;
            te = te->next;
        }
    }
    eventLoop->lastTime = now;
    ...
    prev = NULL;
    te = eventLoop->timeEventHead;
    maxId = eventLoop->timeEventNextId-1;
    while(te) {
        long now_sec, now_ms;
        long long id;
        ...
        aeGetTime(&now_sec, &now_ms);
        if (now_sec > te->when_sec ||
            (now_sec == te->when_sec && now_ms >= te->when_ms))
        {
            int retval;

            id = te->id;
            retval = te->timeProc(eventLoop, id, te->clientData);
            processed++;
            if (retval != AE_NOMORE) {
                aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
            } else {
                te->id = AE_DELETED_EVENT_ID;
            }
        }
        prev = te;
        te = te->next;
    }
    return processed;
}

0x4 Libevent’s implementation

libevent uses timerfd to handle timers when USING_TIMERFD is set. Here is the code: it uses timerfd_create() to create the timer fd.

static void * epoll_init(struct event_base *base)
{
#ifdef USING_TIMERFD
    if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
        base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
        int fd;
        fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
        if (epollop->timerfd >= 0) {
            struct epoll_event epev;
            memset(&epev, 0, sizeof(epev));
            epev.data.fd = epollop->timerfd;
            epev.events = EPOLLIN;
            if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
                event_warn("epoll_ctl(timerfd)");
                close(fd);
                epollop->timerfd = -1;
            }
        }
        ...
    }
#endif
}

Here is the code that sets the timeout for the timerfd; it calls timerfd_settime(). When the timeout expires, the timerfd becomes readable and wakes up epoll_wait(), so the event loop can process the timer handler.

static int epoll_dispatch(struct event_base *base, struct timeval *tv)
{
#ifdef USING_TIMERFD
    if (epollop->timerfd >= 0) {
        struct itimerspec is;
        is.it_interval.tv_sec = 0;
        is.it_interval.tv_nsec = 0;
        if (tv == NULL) {
            is.it_value.tv_sec = 0;
            is.it_value.tv_nsec = 0;
        } else {
            if (tv->tv_sec == 0 && tv->tv_usec == 0) {
                timeout = 0;
            }
            is.it_value.tv_sec = tv->tv_sec;
            is.it_value.tv_nsec = tv->tv_usec * 1000;
        }
        if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
            event_warn("timerfd_settime");
        }
    }
#endif
    ...
    res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout);
    ...
}
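
Stripped of the libevent plumbing, the underlying kernel API can be exercised directly. A minimal standalone example of the same mechanism (a 500 ms one-shot timer observed through epoll):

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
    int epfd = epoll_create1(0);

    epoll_event ev;
    memset(&ev, 0, sizeof(ev));
    ev.events = EPOLLIN;
    ev.data.fd = tfd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev);

    itimerspec its;
    memset(&its, 0, sizeof(its));
    its.it_value.tv_nsec = 500 * 1000 * 1000;   // expire once, 500 ms from now
    timerfd_settime(tfd, 0, &its, nullptr);

    epoll_event out;
    if (epoll_wait(epfd, &out, 1, -1) == 1 && out.data.fd == tfd) {
        uint64_t expirations;   // the read returns the expiration count
        read(tfd, &expirations, sizeof(expirations));
        printf("timer fired %llu time(s)\n", (unsigned long long)expirations);
    }

    close(tfd);
    close(epfd);
    return 0;
}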

Timerfd's kernel implementation

In the Linux kernel, timerfd is implemented on top of hrtimer; the corresponding code is in kernel/fs/timerfd.c.
