diff --git a/CMakeLists.txt b/CMakeLists.txt
index e89f3c0660572df22e2c646200137db1b3469b9a..a8a58375d9ef7e0efda18bd9edf4fd9d727d87a6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,7 +40,7 @@ SET(CXX_DISABLE_WERROR True)
 OPTION(ENABLE_VECTORIZATION "Enable vectorization and futhers processor-related optimizations" OFF)
 OPTION(BUILD_PYTHON_INTERFACE "Build the python binding" ON)
 OPTION(BUILD_UNIT_TESTS "Build the unitary tests" ON)
-OPTION(BUILD_BENCHMARK "Build the benchmark" OFF)
+OPTION(BUILD_BENCHMARK "Build the benchmark" ON)
 
 
 IF(ENABLE_VECTORIZATION)
@@ -57,6 +57,29 @@ ADD_OPTIONAL_DEPENDENCY("multicontact-api >= 1.1.0")
 ADD_OPTIONAL_DEPENDENCY("quadprog")
 ADD_OPTIONAL_DEPENDENCY("scipy")
 
+
+OPTION(BUILD_WITH_MULTITHREADS "Build the library with the OpenMP support (required OpenMP)" OFF)
+IF(BUILD_WITH_MULTITHREADS)
+  SET(BUILD_WITH_NTHREADS "4" CACHE STRING "Number of threads")
+  # Must be a positive decimal integer: 0 is not a valid thread count and a leading zero would turn the macro into an octal literal.
+  IF(NOT BUILD_WITH_NTHREADS MATCHES "^[1-9][0-9]*$")
+    SET(BUILD_WITH_NTHREADS 4)
+    MESSAGE(WARNING "The number of threads has to be a positive integer value, set to ${BUILD_WITH_NTHREADS}")
+  ENDIF()
+ENDIF()
+
+# Add OpenMP: when requested and found, compile with the OpenMP flags and expose the multithreading macros
+if(BUILD_WITH_MULTITHREADS)
+  find_package(OpenMP)
+ENDIF()
+if(OPENMP_FOUND AND BUILD_WITH_MULTITHREADS)
+  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")  # append the detected flags; keep the ones already configured
+  ADD_DEFINITIONS(-DWITH_MULTITHREADING)
+  ADD_DEFINITIONS(-DWITH_NTHREADS=${BUILD_WITH_NTHREADS})
+  LIST(APPEND CFLAGS_DEPENDENCIES "-DWITH_MULTITHREADING" "-DWITH_NTHREADS=${BUILD_WITH_NTHREADS}")  # export the same value the library is built with
+ENDIF()
+
+
 SET(BOOST_REQUIERED_COMPONENTS filesystem serialization system)
 SET(BOOST_BUILD_COMPONENTS unit_test_framework)
 SET(BOOST_OPTIONAL_COMPONENTS "")
@@ -97,4 +120,4 @@ IF(BUILD_BENCHMARK)
 ENDIF(BUILD_BENCHMARK)
 
 
-SETUP_PROJECT_FINALIZE()
\ No newline at end of file
+SETUP_PROJECT_FINALIZE()
diff --git a/benchmark/lqr.cpp b/benchmark/lqr.cpp
index 43b4d995ce174e45a9fe92b942ebc4654c43922b..19d9f6104f733999f02872b2936b57aace639cc6 100644
--- a/benchmark/lqr.cpp
+++ b/benchmark/lqr.cpp
@@ -40,17 +40,52 @@ int main() {
   }
 
   // Solving the optimal control problem
-  std::clock_t c_start, c_end;
+  struct timespec start, finish;  // timestamps taken right before and right after each measured call
+  double elapsed;                 // duration of the current call, accumulated in microseconds
   Eigen::ArrayXd duration(T);
   for (unsigned int i = 0; i < T; ++i) {
-    c_start = std::clock();
+    clock_gettime(CLOCK_MONOTONIC, &start);
     ddp.solve(xs, us, MAXITER);
-    c_end = std::clock();
-    duration[i] = 1e3 * (double)(c_end - c_start) / CLOCKS_PER_SEC;
+    clock_gettime(CLOCK_MONOTONIC, &finish);
+    elapsed = (finish.tv_sec - start.tv_sec) * 1000000.0;  // whole seconds -> microseconds
+    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000.0;  // nanosecond part -> microseconds
+    duration[i] = elapsed / 1000.;                         // stored in milliseconds
   }
 
   double avrg_duration = duration.sum() / T;
   double min_duration = duration.minCoeff();
   double max_duration = duration.maxCoeff();
-  std::cout << "CPU time [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")" << std::endl;
+  std::cout << "Wall time solve [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")" << std::endl;  // average (min-max) of the solve durations, in milliseconds
+
+  // Running calc: measure the wall-clock duration of each of the T calls to problem.calc
+  for (unsigned int i = 0; i < T; ++i) {
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    problem.calc(xs, us);
+    clock_gettime(CLOCK_MONOTONIC, &finish);
+    elapsed = (finish.tv_sec - start.tv_sec) * 1000000.0;  // whole seconds -> microseconds
+    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000.0;  // nanosecond part -> microseconds
+    duration[i] = elapsed / 1000.;                         // stored in milliseconds
+  }
+
+  avrg_duration = duration.sum() / T;  // the statistics variables are reused from the solve measurement
+  min_duration = duration.minCoeff();
+  max_duration = duration.maxCoeff();
+  std::cout << "Wall time calc [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")"
+            << std::endl;
+
+  // Running calcDiff: measure the wall-clock duration of each of the T calls to problem.calcDiff
+  for (unsigned int i = 0; i < T; ++i) {
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    problem.calcDiff(xs, us);
+    clock_gettime(CLOCK_MONOTONIC, &finish);
+    elapsed = (finish.tv_sec - start.tv_sec) * 1000000.0;  // whole seconds -> microseconds
+    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000.0;  // nanosecond part -> microseconds
+    duration[i] = elapsed / 1000.;                         // stored in milliseconds
+  }
+
+  avrg_duration = duration.sum() / T;  // overwrites the statistics of the previous measurement
+  min_duration = duration.minCoeff();
+  max_duration = duration.maxCoeff();
+  std::cout << "Wall time calcDiff [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")"
+            << std::endl;
 }
\ No newline at end of file
diff --git a/benchmark/unicycle.cpp b/benchmark/unicycle.cpp
index 07516cf466c7a103d3aec03adc2bd6410874e7d8..92a726560e29b55110a93b8d98ea513d86d88117 100644
--- a/benchmark/unicycle.cpp
+++ b/benchmark/unicycle.cpp
@@ -1,7 +1,11 @@
 #include "crocoddyl/core/actions/unicycle.hpp"
 #include "crocoddyl/core/utils/callbacks.hpp"
 #include "crocoddyl/core/solvers/ddp.hpp"
-#include <ctime>
+#include <time.h>
+
+#ifdef WITH_MULTITHREADING
+#include <omp.h>
+#endif  // WITH_MULTITHREADING
 
 int main() {
   bool CALLBACKS = false;
@@ -37,17 +41,53 @@ int main() {
   }
 
   // Solving the optimal control problem
-  std::clock_t c_start, c_end;
+  struct timespec start, finish;  // timestamps taken right before and right after each measured call
+  double elapsed;                 // duration of the current call, accumulated in microseconds
   Eigen::ArrayXd duration(T);
   for (unsigned int i = 0; i < T; ++i) {
-    c_start = std::clock();
+    clock_gettime(CLOCK_MONOTONIC, &start);
     ddp.solve(xs, us, MAXITER);
-    c_end = std::clock();
-    duration[i] = 1e3 * (double)(c_end - c_start) / CLOCKS_PER_SEC;
+    clock_gettime(CLOCK_MONOTONIC, &finish);
+    elapsed = (finish.tv_sec - start.tv_sec) * 1000000.0;  // whole seconds -> microseconds
+    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000.0;  // nanosecond part -> microseconds
+    duration[i] = elapsed / 1000.;                         // stored in milliseconds
   }
 
   double avrg_duration = duration.sum() / T;
   double min_duration = duration.minCoeff();
   double max_duration = duration.maxCoeff();
-  std::cout << "CPU time [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")" << std::endl;
+  std::cout << "Wall time solve [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")"
+            << std::endl;
+
+  // Running calc: measure the wall-clock duration of each of the T calls to problem.calc
+  for (unsigned int i = 0; i < T; ++i) {
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    problem.calc(xs, us);
+    clock_gettime(CLOCK_MONOTONIC, &finish);
+    elapsed = (finish.tv_sec - start.tv_sec) * 1000000.0;  // whole seconds -> microseconds
+    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000.0;  // nanosecond part -> microseconds
+    duration[i] = elapsed / 1000.;                         // stored in milliseconds
+  }
+
+  avrg_duration = duration.sum() / T;  // the statistics variables are reused from the solve measurement
+  min_duration = duration.minCoeff();
+  max_duration = duration.maxCoeff();
+  std::cout << "Wall time calc [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")"
+            << std::endl;
+
+  // Running calcDiff: measure the wall-clock duration of each of the T calls to problem.calcDiff
+  for (unsigned int i = 0; i < T; ++i) {
+    clock_gettime(CLOCK_MONOTONIC, &start);
+    problem.calcDiff(xs, us);
+    clock_gettime(CLOCK_MONOTONIC, &finish);
+    elapsed = (finish.tv_sec - start.tv_sec) * 1000000.0;  // whole seconds -> microseconds
+    elapsed += (finish.tv_nsec - start.tv_nsec) / 1000.0;  // nanosecond part -> microseconds
+    duration[i] = elapsed / 1000.;                         // stored in milliseconds
+  }
+
+  avrg_duration = duration.sum() / T;  // overwrites the statistics of the previous measurement
+  min_duration = duration.minCoeff();
+  max_duration = duration.maxCoeff();
+  std::cout << "Wall time calcDiff [ms]: " << avrg_duration << " (" << min_duration << "-" << max_duration << ")"
+            << std::endl;
 }
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b906f4ef5f914b703a6688f6d5a379f175f009aa..4f49c86a7a6212967c26dad1bc72b6fa7731030d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -49,6 +49,10 @@ IF(UNIX)
   PKG_CONFIG_USE_DEPENDENCY(${PROJECT_NAME} pinocchio)
   TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${Boost_FILESYSTEM_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_SERIALIZATION_LIBRARY})
 
+  if(OPENMP_FOUND AND BUILD_WITH_MULTITHREADS)  # same guard as the top-level CMakeLists.txt uses to enable the OpenMP flags
+    TARGET_LINK_LIBRARIES(${PROJECT_NAME} ${OpenMP_CXX_LIBRARIES})  # keyword-less form, matching the Boost link call above
+  ENDIF()
+  
   INSTALL(TARGETS ${PROJECT_NAME} DESTINATION lib)
   INSTALL(DIRECTORY ${CMAKE_SOURCE_DIR}/include/
           DESTINATION include
diff --git a/src/core/optctrl/shooting.cpp b/src/core/optctrl/shooting.cpp
index c85e4c65624a5dcc96548c1de1f248f57623b07f..b3f185433e3f8ba32bbdba0ec290e15127341bf9 100644
--- a/src/core/optctrl/shooting.cpp
+++ b/src/core/optctrl/shooting.cpp
@@ -7,6 +7,11 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 #include "crocoddyl/core/optctrl/shooting.hpp"
+#include <iostream>
+#ifdef WITH_MULTITHREADING
+#include <omp.h>
+#define NUM_THREADS WITH_NTHREADS
+#endif  // WITH_MULTITHREADING
 
 namespace crocoddyl {
 
@@ -47,15 +52,20 @@ double ShootingProblem::calcDiff(const std::vector<Eigen::VectorXd>& xs, const s
   assert(us.size() == T_ && "Wrong dimension of the control trajectory, it should be T.");
 
   cost_ = 0;
-  for (unsigned int i = 0; i < T_; ++i) {
-    ActionModelAbstract* model = running_models_[i];
-    boost::shared_ptr<ActionDataAbstract>& data = running_datas_[i];
-    const Eigen::VectorXd& x = xs[i];
-    const Eigen::VectorXd& u = us[i];
+  // Evaluate the derivatives of every running node. With OpenMP the iterations are shared by a team of
+  // NUM_THREADS threads requested for this region only, leaving the process-wide thread setting untouched.
+  // NOTE(review): presumes calcDiff on distinct nodes shares no mutable state - confirm for repeated models.
+#ifdef WITH_MULTITHREADING
+#pragma omp parallel for num_threads(NUM_THREADS)
+#endif
+  for (unsigned int i = 0; i < T_; ++i) {
+    running_models_[i]->calcDiff(running_datas_[i], xs[i], us[i]);
+  }
 
-    model->calcDiff(data, x, u);
-    cost_ += data->cost;
+  for (unsigned int i = 0; i < T_; ++i) {
+    cost_ += running_datas_[i]->cost;
   }
+
   terminal_model_->calcDiff(terminal_data_, xs.back());
   cost_ += terminal_data_->cost;
   return cost_;