diff --git a/SConstruct b/SConstruct
index 5a111b561..73c66690a 100644
--- a/SConstruct
+++ b/SConstruct
@@ -68,6 +68,7 @@ lenv = {
   "PYTHONPATH": Dir("#").abspath + ":" + Dir("#pyextra/").abspath,
 
   "ACADOS_SOURCE_DIR": Dir("#third_party/acados/acados").abspath,
+  "ACADOS_PYTHON_INTERFACE_PATH": Dir("#pyextra/acados_template").abspath,
   "TERA_PATH": Dir("#").abspath + f"/third_party/acados/{arch}/t_renderer",
 }
 
diff --git a/pyextra/acados_template/.gitignore b/pyextra/acados_template/.gitignore
index c18dd8d83..63b741bb9 100644
--- a/pyextra/acados_template/.gitignore
+++ b/pyextra/acados_template/.gitignore
@@ -1 +1,6 @@
 __pycache__/
+
+# Cython intermediates
+*_pyx.c
+*_pyx.o
+*_pyx.so
diff --git a/pyextra/acados_template/acados_layout.json b/pyextra/acados_template/acados_layout.json
index f4e332351..16fd45233 100644
--- a/pyextra/acados_template/acados_layout.json
+++ b/pyextra/acados_template/acados_layout.json
@@ -666,6 +666,9 @@
         "nlp_solver_type": [
             "str"
         ],
+        "collocation_type": [
+            "str"
+        ],
         "globalization": [
             "str"
         ],
diff --git a/pyextra/acados_template/acados_model.py b/pyextra/acados_template/acados_model.py
index 379ade0d9..4871a2c0d 100644
--- a/pyextra/acados_template/acados_model.py
+++ b/pyextra/acados_template/acados_model.py
@@ -96,6 +96,9 @@ class AcadosModel():
         self.cost_expr_ext_cost = None  #: CasADi expression for external cost; Default: :code:`None`
         self.cost_expr_ext_cost_e = None  #: CasADi expression for external cost, terminal; Default: :code:`None`
         self.cost_expr_ext_cost_0 = None  #: CasADi expression for external cost, initial; Default: :code:`None`
+        self.cost_expr_ext_cost_custom_hess = None  #: CasADi expression for custom hessian (only for external cost); Default: :code:`None`
+        self.cost_expr_ext_cost_custom_hess_e = None  #: CasADi expression for custom hessian (only for external cost), terminal; Default: :code:`None`
+        self.cost_expr_ext_cost_custom_hess_0 = None  #: CasADi expression for custom hessian (only for external cost), initial; Default: :code:`None`
 
 
 def acados_model_strip_casadi_symbolics(model):
@@ -147,5 +150,11 @@ def acados_model_strip_casadi_symbolics(model):
         del out['cost_expr_ext_cost_e']
     if 'cost_expr_ext_cost_0' in out.keys():
         del out['cost_expr_ext_cost_0']
+    if 'cost_expr_ext_cost_custom_hess' in out.keys():
+        del out['cost_expr_ext_cost_custom_hess']
+    if 'cost_expr_ext_cost_custom_hess_e' in out.keys():
+        del out['cost_expr_ext_cost_custom_hess_e']
+    if 'cost_expr_ext_cost_custom_hess_0' in out.keys():
+        del out['cost_expr_ext_cost_custom_hess_0']
 
     return out
diff --git a/pyextra/acados_template/acados_ocp.py b/pyextra/acados_template/acados_ocp.py
index 9c2f02aa8..198ab033d 100644
--- a/pyextra/acados_template/acados_ocp.py
+++ b/pyextra/acados_template/acados_ocp.py
@@ -2120,6 +2120,7 @@ class AcadosOcpOptions:
         self.__globalization = 'FIXED_STEP'
         self.__nlp_solver_step_length = 1.0                   # fixed Newton step length
         self.__levenberg_marquardt = 0.0
+        self.__collocation_type = 'GAUSS_LEGENDRE'
         self.__sim_method_num_stages  = 4                     # number of stages in the integrator
         self.__sim_method_num_steps   = 1                     # number of steps in the integrator
         self.__sim_method_newton_iter = 3                     # number of Newton iterations in simulation method
@@ -2195,6 +2196,15 @@ class AcadosOcpOptions:
         """
         return self.__globalization
 
+    @property
+    def collocation_type(self):
+        """Collocation type: relevant for implicit integrators
+        -- string in {GAUSS_RADAU_IIA, GAUSS_LEGENDRE}.
+
+        Default: GAUSS_LEGENDRE
+        """
+        return self.__collocation_type
+
     @property
     def regularize_method(self):
         """Regularization method for the Hessian.
@@ -2481,6 +2491,15 @@ class AcadosOcpOptions:
             raise Exception('Invalid regularize_method value. Possible values are:\n\n' \
                     + ',\n'.join(regularize_methods) + '.\n\nYou have: ' + regularize_method + '.\n\nExiting.')
 
+    @collocation_type.setter
+    def collocation_type(self, collocation_type):
+        collocation_types = ('GAUSS_RADAU_IIA', 'GAUSS_LEGENDRE')
+        if collocation_type in collocation_types:
+            self.__collocation_type = collocation_type
+        else:
+            raise Exception('Invalid collocation_type value. Possible values are:\n\n' \
+                    + ',\n'.join(collocation_types) + '.\n\nYou have: ' + collocation_type + '.\n\nExiting.')
+
     @hessian_approx.setter
     def hessian_approx(self, hessian_approx):
         hessian_approxs = ('GAUSS_NEWTON', 'EXACT')
diff --git a/pyextra/acados_template/acados_ocp_solver.py b/pyextra/acados_template/acados_ocp_solver.py
index 4e80e1205..b34b8fa84 100644
--- a/pyextra/acados_template/acados_ocp_solver.py
+++ b/pyextra/acados_template/acados_ocp_solver.py
@@ -53,7 +53,7 @@ from .acados_ocp import AcadosOcp
 from .acados_model import acados_model_strip_casadi_symbolics
 from .utils import is_column, is_empty, casadi_length, render_template, acados_class2dict,\
      format_class_dict, ocp_check_against_layout, np_array_to_list, make_model_consistent,\
-     set_up_imported_gnsf_model, get_acados_path
+     set_up_imported_gnsf_model, get_acados_path, get_ocp_nlp_layout, get_python_interface_path
 
 
 def make_ocp_dims_consistent(acados_ocp):
@@ -102,6 +102,7 @@ def make_ocp_dims_consistent(acados_ocp):
         cost.cost_ext_fun_type_0 = cost.cost_ext_fun_type
         model.cost_y_expr_0 = model.cost_y_expr
         model.cost_expr_ext_cost_0 = model.cost_expr_ext_cost
+        model.cost_expr_ext_cost_custom_hess_0 = model.cost_expr_ext_cost_custom_hess
 
     if cost.cost_type_0 == 'LINEAR_LS':
         ny_0 = cost.W_0.shape[0]
@@ -433,9 +434,19 @@ def make_ocp_dims_consistent(acados_ocp):
         if np.shape(opts.shooting_nodes)[0] != dims.N+1:
             raise Exception('inconsistent dimension N, regarding shooting_nodes.')
 
+        # time_steps = opts.shooting_nodes[1:] - opts.shooting_nodes[0:-1]
+        # # identify constant time-steps: due to numerical reasons the content of time_steps might vary a bit
+        # delta_time_steps = time_steps[1:] - time_steps[0:-1]
+        # avg_time_steps = np.average(time_steps)
+        # # criterion for constant time-step detection: the min/max difference in values normalized by the average
+        # check_const_time_step = np.max(delta_time_steps)-np.min(delta_time_steps) / avg_time_steps
+        # # if the criterion is small, we have a constant time-step
+        # if check_const_time_step < 1e-9:
+        #     time_steps[:] = avg_time_steps  # if we have a constant time-step: apply the average time-step
         time_steps = np.zeros((dims.N,))
         for i in range(dims.N):
-            time_steps[i] = opts.shooting_nodes[i+1] - opts.shooting_nodes[i]
+            time_steps[i] = opts.shooting_nodes[i+1] - opts.shooting_nodes[i] # TODO use commented code above
+
         opts.time_steps = time_steps
 
     elif (not is_empty(opts.time_steps)) and (not is_empty(opts.shooting_nodes)):
@@ -483,13 +494,12 @@ def make_ocp_dims_consistent(acados_ocp):
         raise Exception("Wrong value for sim_method_jac_reuse. Should be either int or array of ints of shape (N,).")
 
 
-
-def get_ocp_nlp_layout():
-    current_module = sys.modules[__name__]
-    acados_path = os.path.dirname(current_module.__file__)
-    with open(acados_path + '/acados_layout.json', 'r') as f:
-        ocp_nlp_layout = json.load(f)
-    return ocp_nlp_layout
+def get_simulink_default_opts():
+    python_interface_path = get_python_interface_path()
+    abs_path = os.path.join(python_interface_path, 'simulink_default_opts.json')
+    with open(abs_path , 'r') as f:
+        simulink_default_opts = json.load(f)
+    return simulink_default_opts
 
 
 def ocp_formulation_json_dump(acados_ocp, simulink_opts, json_file='acados_ocp_nlp.json'):
@@ -622,9 +632,7 @@ def ocp_render_templates(acados_ocp, json_file):
     name = acados_ocp.model.name
 
     # setting up loader and environment
-    json_path = '{cwd}/{json_file}'.format(
-        cwd=os.getcwd(),
-        json_file=json_file)
+    json_path = os.path.join(os.getcwd(), json_file)
 
     if not os.path.exists(json_path):
         raise Exception('{} not found!'.format(json_path))
@@ -645,6 +653,10 @@ def ocp_render_templates(acados_ocp, json_file):
     out_file = f'acados_solver_{name}.h'
     render_template(in_file, out_file, template_dir, json_path)
 
+    in_file = 'acados_solver.in.pxd'
+    out_file = f'acados_solver.pxd'
+    render_template(in_file, out_file, template_dir, json_path)
+
     in_file = 'Makefile.in'
     out_file = 'Makefile'
     render_template(in_file, out_file, template_dir, json_path)
@@ -671,8 +683,7 @@ def ocp_render_templates(acados_ocp, json_file):
     render_template(in_file, out_file, template_dir, json_path)
 
     ## folder model
-    template_dir = f'{code_export_dir}/{name}_model/'
-
+    template_dir = os.path.join(code_export_dir, name + '_model')
     in_file = 'model.in.h'
     out_file = f'{name}_model.h'
     render_template(in_file, out_file, template_dir, json_path)
@@ -680,7 +691,7 @@ def ocp_render_templates(acados_ocp, json_file):
     # constraints on convex over nonlinear function
     if acados_ocp.constraints.constr_type == 'BGP' and acados_ocp.dims.nphi > 0:
         # constraints on outer function
-        template_dir = f'{code_export_dir}/{name}_constraints/'
+        template_dir = os.path.join(code_export_dir, name + '_constraints')
         in_file = 'phi_constraint.in.h'
         out_file = f'{name}_phi_constraint.h'
         render_template(in_file, out_file, template_dir, json_path)
@@ -688,62 +699,62 @@ def ocp_render_templates(acados_ocp, json_file):
     # terminal constraints on convex over nonlinear function
     if acados_ocp.constraints.constr_type_e == 'BGP' and acados_ocp.dims.nphi_e > 0:
         # terminal constraints on outer function
-        template_dir = f'{code_export_dir}/{name}_constraints/'
+        template_dir = os.path.join(code_export_dir, name + '_constraints')
         in_file = 'phi_e_constraint.in.h'
         out_file = f'{name}_phi_e_constraint.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # nonlinear constraints
     if acados_ocp.constraints.constr_type == 'BGH' and acados_ocp.dims.nh > 0:
-        template_dir = f'{code_export_dir}/{name}_constraints/'
+        template_dir = os.path.join(code_export_dir, name + '_constraints')
         in_file = 'h_constraint.in.h'
         out_file = f'{name}_h_constraint.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # terminal nonlinear constraints
     if acados_ocp.constraints.constr_type_e == 'BGH' and acados_ocp.dims.nh_e > 0:
-        template_dir = f'{code_export_dir}/{name}_constraints/'
+        template_dir = os.path.join(code_export_dir, name + '_constraints')
         in_file = 'h_e_constraint.in.h'
         out_file = f'{name}_h_e_constraint.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # initial stage Nonlinear LS cost function
     if acados_ocp.cost.cost_type_0 == 'NONLINEAR_LS':
-        template_dir = f'{code_export_dir}/{name}_cost/'
+        template_dir = os.path.join(code_export_dir, name + '_cost')
         in_file = 'cost_y_0_fun.in.h'
         out_file = f'{name}_cost_y_0_fun.h'
         render_template(in_file, out_file, template_dir, json_path)
     # external cost - terminal
     elif acados_ocp.cost.cost_type_0 == 'EXTERNAL':
-        template_dir = f'{code_export_dir}/{name}_cost/'
+        template_dir = os.path.join(code_export_dir, name + '_cost')
         in_file = 'external_cost_0.in.h'
         out_file = f'{name}_external_cost_0.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # path Nonlinear LS cost function
     if acados_ocp.cost.cost_type == 'NONLINEAR_LS':
-        template_dir = f'{code_export_dir}/{name}_cost/'
+        template_dir = os.path.join(code_export_dir, name + '_cost')
         in_file = 'cost_y_fun.in.h'
         out_file = f'{name}_cost_y_fun.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # terminal Nonlinear LS cost function
     if acados_ocp.cost.cost_type_e == 'NONLINEAR_LS':
-        template_dir = f'{code_export_dir}/{name}_cost/'
+        template_dir = os.path.join(code_export_dir, name + '_cost')
         in_file = 'cost_y_e_fun.in.h'
         out_file = f'{name}_cost_y_e_fun.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # external cost
     if acados_ocp.cost.cost_type == 'EXTERNAL':
-        template_dir = f'{code_export_dir}/{name}_cost/'
+        template_dir = os.path.join(code_export_dir, name + '_cost')
         in_file = 'external_cost.in.h'
         out_file = f'{name}_external_cost.h'
         render_template(in_file, out_file, template_dir, json_path)
 
     # external cost - terminal
     if acados_ocp.cost.cost_type_e == 'EXTERNAL':
-        template_dir = f'{code_export_dir}/{name}_cost/'
+        template_dir = os.path.join(code_export_dir, name + '_cost')
         in_file = 'external_cost_e.in.h'
         out_file = f'{name}_external_cost_e.h'
         render_template(in_file, out_file, template_dir, json_path)
@@ -775,9 +786,7 @@ class AcadosOcpSolver:
         model = acados_ocp.model
 
         if simulink_opts is None:
-            json_path = os.path.dirname(os.path.realpath(__file__))
-            with open(json_path + '/simulink_default_opts.json', 'r') as f:
-                simulink_opts = json.load(f)
+            simulink_opts = get_simulink_default_opts()
 
         # make dims consistent
         make_ocp_dims_consistent(acados_ocp)
@@ -830,6 +839,14 @@ class AcadosOcpSolver:
         assert getattr(self.shared_lib, f"{self.model_name}_acados_create")(self.capsule)==0
         self.solver_created = True
 
+        # get pointers solver
+        self.__get_pointers_solver()
+
+
+    def __get_pointers_solver(self):
+        """
+        Private function to get the pointers for solver
+        """
         # get pointers solver
         getattr(self.shared_lib, f"{self.model_name}_acados_get_nlp_opts").argtypes = [c_void_p]
         getattr(self.shared_lib, f"{self.model_name}_acados_get_nlp_opts").restype = c_void_p
@@ -863,14 +880,10 @@ class AcadosOcpSolver:
         self.shared_lib.ocp_nlp_constraint_dims_get_from_attr.argtypes = \
             [c_void_p, c_void_p, c_void_p, c_int, c_char_p, POINTER(c_int)]
         self.shared_lib.ocp_nlp_constraint_dims_get_from_attr.restype = c_int
-        self.shared_lib.ocp_nlp_constraints_model_set_slice.argtypes = \
-            [c_void_p, c_void_p, c_void_p, c_int, c_int, c_char_p, c_void_p, c_int]
 
         self.shared_lib.ocp_nlp_cost_dims_get_from_attr.argtypes = \
             [c_void_p, c_void_p, c_void_p, c_int, c_char_p, POINTER(c_int)]
         self.shared_lib.ocp_nlp_cost_dims_get_from_attr.restype = c_int
-        self.shared_lib.ocp_nlp_cost_model_set_slice.argtypes = \
-            [c_void_p, c_void_p, c_void_p, c_int, c_int, c_char_p, c_void_p, c_int]
 
         self.shared_lib.ocp_nlp_constraints_model_set.argtypes = \
             [c_void_p, c_void_p, c_void_p, c_int, c_char_p, c_void_p]
@@ -881,15 +894,6 @@ class AcadosOcpSolver:
         self.shared_lib.ocp_nlp_set.argtypes = \
             [c_void_p, c_void_p, c_int, c_char_p, c_void_p]
 
-        self.shared_lib.ocp_nlp_dims_get_from_attr.argtypes = \
-            [c_void_p, c_void_p, c_void_p, c_int, c_char_p]
-        self.shared_lib.ocp_nlp_dims_get_from_attr.restype = c_int
-        self.shared_lib.ocp_nlp_out_get_slice.argtypes = \
-            [c_void_p, c_void_p, c_void_p, c_int, c_int, c_char_p, c_void_p]
-        self.shared_lib.ocp_nlp_get_at_stage.argtypes = \
-            [c_void_p, c_void_p, c_void_p, c_int, c_char_p, c_void_p]
-
-
     def solve(self):
         """
         Solve the ocp with current input.
@@ -901,46 +905,118 @@ class AcadosOcpSolver:
         return status
 
 
-    def get_slice(self, start_stage_, end_stage_, field_):
-        dims = self.shared_lib.ocp_nlp_dims_get_from_attr(self.nlp_config, \
-            self.nlp_dims, self.nlp_out, start_stage_, field_)
-        out = np.ascontiguousarray(np.zeros((end_stage_ - start_stage_, dims)), dtype=np.float64)
-        self.fill_in_slice(start_stage_, end_stage_, field_, out)
-        return out
+    def set_new_time_steps(self, new_time_steps):
+        """
+        Set new time steps before solving. Only reload library without code generation but with new time steps.
 
-    def fill_in_slice(self, start_stage_, end_stage_, field_, arr):
-        out_fields = ['x', 'u', 'z', 'pi', 'lam', 't']
-        mem_fields = ['sl', 'su']
+            :param new_time_steps: vector of new time steps for the solver
+
+            .. note:: This allows for different use-cases: either set a new size of time-steps or a new distribution of
+                      the shooting nodes without changing the number, e.g., to reach a different final time. Both cases
+                      do not require a new code export and compilation.
+        """
+
+        # unlikely but still possible
+        if not self.solver_created:
+            raise Exception('Solver was not yet created!')
+
+        # check if time steps really changed in value
+        if np.array_equal(self.acados_ocp.solver_options.time_steps, new_time_steps):
+            return
+
+        N = new_time_steps.size
+        model = self.acados_ocp.model
+        new_time_steps_data = cast(new_time_steps.ctypes.data, POINTER(c_double))
+
+        # check if recreation of acados is necessary (no need to recreate acados if sizes are identical)
+        if self.acados_ocp.solver_options.time_steps.size == N:
+            getattr(self.shared_lib, f"{self.model_name}_acados_update_time_steps").argtypes = [c_void_p, c_int, c_void_p]
+            getattr(self.shared_lib, f"{self.model_name}_acados_update_time_steps").restype = c_int
+            assert getattr(self.shared_lib, f"{self.model_name}_acados_update_time_steps")(self.capsule, N, new_time_steps_data) == 0
+        else:  # recreate the solver with the new time steps
+            self.solver_created = False
+
+            # delete old memory (analog to __del__)
+            getattr(self.shared_lib, f"{self.model_name}_acados_free").argtypes = [c_void_p]
+            getattr(self.shared_lib, f"{self.model_name}_acados_free").restype = c_int
+            getattr(self.shared_lib, f"{self.model_name}_acados_free")(self.capsule)
+
+            # store N and new time steps
+            self.N = self.acados_ocp.dims.N = N
+            self.acados_ocp.solver_options.time_steps = new_time_steps
+            self.acados_ocp.solver_options.Tsim = self.acados_ocp.solver_options.time_steps[0]
+
+            # create solver with new time steps
+            getattr(self.shared_lib, f"{self.model_name}_acados_create_with_discretization").argtypes = [c_void_p, c_int, c_void_p]
+            getattr(self.shared_lib, f"{self.model_name}_acados_create_with_discretization").restype = c_int
+            assert getattr(self.shared_lib, f"{self.model_name}_acados_create_with_discretization")(self.capsule, N, new_time_steps_data) == 0
+
+            self.solver_created = True
+
+            # get pointers solver
+            self.__get_pointers_solver()
+
+
+    def get(self, stage_, field_):
+        """
+        Get the last solution of the solver:
+
+            :param stage: integer corresponding to shooting node
+            :param field: string in ['x', 'u', 'z', 'pi', 'lam', 't', 'sl', 'su',]
+
+            .. note:: regarding lam, t: \n
+                    the inequalities are internally organized in the following order: \n
+                    [ lbu lbx lg lh lphi ubu ubx ug uh uphi; \n
+                      lsbu lsbx lsg lsh lsphi usbu usbx usg ush usphi]
+
+            .. note:: pi: multipliers for dynamics equality constraints \n
+                      lam: multipliers for inequalities \n
+                      t: slack variables corresponding to evaluation of all inequalities (at the solution) \n
+                      sl: slack variables of soft lower inequality constraints \n
+                      su: slack variables of soft upper inequality constraints \n
+        """
+
+        out_fields = ['x', 'u', 'z', 'pi', 'lam', 't', 'sl', 'su']
+        # mem_fields = ['sl', 'su']
         field = field_
         field = field.encode('utf-8')
 
-        if (field_ not in out_fields + mem_fields):
-            raise Exception('AcadosOcpSolver.get_slice(): {} is an invalid argument.\
+        if (field_ not in out_fields):
+            raise Exception('AcadosOcpSolver.get(): {} is an invalid argument.\
                     \n Possible values are {}. Exiting.'.format(field_, out_fields))
 
-        if not isinstance(start_stage_, int):
-            raise Exception('AcadosOcpSolver.get_slice(): stage index must be Integer.')
+        if not isinstance(stage_, int):
+            raise Exception('AcadosOcpSolver.get(): stage index must be Integer.')
 
-        if not isinstance(end_stage_, int):
-            raise Exception('AcadosOcpSolver.get_slice(): stage index must be Integer.')
+        if stage_ < 0 or stage_ > self.N:
+            raise Exception('AcadosOcpSolver.get(): stage index must be in [0, N], got: {}.'.format(self.N))
 
-        if start_stage_ >= end_stage_:
-            raise Exception('AcadosOcpSolver.get_slice(): end stage index must be larger than start stage index')
+        if stage_ == self.N and field_ == 'pi':
+            raise Exception('AcadosOcpSolver.get(): field {} does not exist at final stage {}.'\
+                .format(field_, stage_))
 
-        if start_stage_ < 0 or end_stage_ > self.N + 1:
-            raise Exception('AcadosOcpSolver.get_slice(): stage index must be in [0, N], got: {}.'.format(self.N))
+        self.shared_lib.ocp_nlp_dims_get_from_attr.argtypes = \
+            [c_void_p, c_void_p, c_void_p, c_int, c_char_p]
+        self.shared_lib.ocp_nlp_dims_get_from_attr.restype = c_int
 
-        out_data = cast(arr.ctypes.data, POINTER(c_double))
+        dims = self.shared_lib.ocp_nlp_dims_get_from_attr(self.nlp_config, \
+            self.nlp_dims, self.nlp_out, stage_, field)
+
+        out = np.ascontiguousarray(np.zeros((dims,)), dtype=np.float64)
+        out_data = cast(out.ctypes.data, POINTER(c_double))
 
         if (field_ in out_fields):
-            self.shared_lib.ocp_nlp_out_get_slice(self.nlp_config, \
-                self.nlp_dims, self.nlp_out, start_stage_, end_stage_, field, out_data)
-        elif field_ in mem_fields:
-            self.shared_lib.ocp_nlp_get_at_stage(self.nlp_config, \
-                self.nlp_dims, self.nlp_solver, start_stage_, end_stage_, field, out_data)
+            self.shared_lib.ocp_nlp_out_get.argtypes = \
+                [c_void_p, c_void_p, c_void_p, c_int, c_char_p, c_void_p]
+            self.shared_lib.ocp_nlp_out_get(self.nlp_config, \
+                self.nlp_dims, self.nlp_out, stage_, field, out_data)
+        # elif field_ in mem_fields:
+        #     self.shared_lib.ocp_nlp_get_at_stage.argtypes = \
+        #         [c_void_p, c_void_p, c_void_p, c_int, c_char_p, c_void_p]
+        #     self.shared_lib.ocp_nlp_get_at_stage(self.nlp_config, \
+        #         self.nlp_dims, self.nlp_solver, stage_, field, out_data)
 
-    def get(self, stage_, field_):
-        return self.get_slice(stage_, stage_ + 1, field_)[0]
+        return out
 
 
     def print_statistics(self):
@@ -1176,8 +1252,7 @@ class AcadosOcpSolver:
         """
         cost_fields = ['y_ref', 'yref']
         constraints_fields = ['lbx', 'ubx', 'lbu', 'ubu']
-        out_fields = ['x', 'u', 'pi', 'lam', 't', 'z']
-        mem_fields = ['sl', 'su']
+        out_fields = ['x', 'u', 'pi', 'lam', 't', 'z', 'sl', 'su']
 
         # cast value_ to avoid conversion issues
         if isinstance(value_, (float, int)):
@@ -1189,47 +1264,52 @@ class AcadosOcpSolver:
 
         stage = c_int(stage_)
 
-        if field_ not in constraints_fields + cost_fields + out_fields + mem_fields:
-            raise Exception("AcadosOcpSolver.set(): {} is not a valid argument.\
-                \nPossible values are {}. Exiting.".format(field, \
-                constraints_fields + cost_fields + out_fields + ['p']))
+        # treat parameters separately
+        if field_ == 'p':
+            getattr(self.shared_lib, f"{self.model_name}_acados_update_params").argtypes = [c_void_p, c_int, POINTER(c_double)]
+            getattr(self.shared_lib, f"{self.model_name}_acados_update_params").restype = c_int
 
-        self.shared_lib.ocp_nlp_dims_get_from_attr.argtypes = \
-            [c_void_p, c_void_p, c_void_p, c_int, c_char_p]
-        self.shared_lib.ocp_nlp_dims_get_from_attr.restype = c_int
+            value_data = cast(value_.ctypes.data, POINTER(c_double))
 
-        dims = self.shared_lib.ocp_nlp_dims_get_from_attr(self.nlp_config, \
-            self.nlp_dims, self.nlp_out, stage_, field)
+            assert getattr(self.shared_lib, f"{self.model_name}_acados_update_params")(self.capsule, stage, value_data, value_.shape[0])==0
+        else:
+            if field_ not in constraints_fields + cost_fields + out_fields:
+                raise Exception("AcadosOcpSolver.set(): {} is not a valid argument.\
+                    \nPossible values are {}. Exiting.".format(field, \
+                    constraints_fields + cost_fields + out_fields + ['p']))
 
-        if value_.shape[0] != dims:
-            msg = 'AcadosOcpSolver.set(): mismatching dimension for field "{}" '.format(field_)
-            msg += 'with dimension {} (you have {})'.format(dims, value_.shape)
-            raise Exception(msg)
+            self.shared_lib.ocp_nlp_dims_get_from_attr.argtypes = \
+                [c_void_p, c_void_p, c_void_p, c_int, c_char_p]
+            self.shared_lib.ocp_nlp_dims_get_from_attr.restype = c_int
+
+            dims = self.shared_lib.ocp_nlp_dims_get_from_attr(self.nlp_config, \
+                self.nlp_dims, self.nlp_out, stage_, field)
+
+            if value_.shape[0] != dims:
+                msg = 'AcadosOcpSolver.set(): mismatching dimension for field "{}" '.format(field_)
+                msg += 'with dimension {} (you have {})'.format(dims, value_.shape[0])
+                raise Exception(msg)
+
+            value_data = cast(value_.ctypes.data, POINTER(c_double))
+            value_data_p = cast((value_data), c_void_p)
+
+            if field_ in constraints_fields:
+                self.shared_lib.ocp_nlp_constraints_model_set(self.nlp_config, \
+                    self.nlp_dims, self.nlp_in, stage, field, value_data_p)
+            elif field_ in cost_fields:
+                self.shared_lib.ocp_nlp_cost_model_set(self.nlp_config, \
+                    self.nlp_dims, self.nlp_in, stage, field, value_data_p)
+            elif field_ in out_fields:
+                self.shared_lib.ocp_nlp_out_set(self.nlp_config, \
+                    self.nlp_dims, self.nlp_out, stage, field, value_data_p)
+            # elif field_ in mem_fields:
+            #     self.shared_lib.ocp_nlp_set(self.nlp_config, \
+            #         self.nlp_solver, stage, field, value_data_p)
 
-        value_data_p = cast(value_.ctypes.data, c_void_p)
-        if field_ in constraints_fields:
-            self.shared_lib.ocp_nlp_constraints_model_set(self.nlp_config, \
-                self.nlp_dims, self.nlp_in, stage, field, value_data_p)
-        elif field_ in cost_fields:
-            self.shared_lib.ocp_nlp_cost_model_set(self.nlp_config, \
-                self.nlp_dims, self.nlp_in, stage, field, value_data_p)
-        elif field_ in out_fields:
-            self.shared_lib.ocp_nlp_out_set(self.nlp_config, \
-                self.nlp_dims, self.nlp_out, stage, field, value_data_p)
-        elif field_ in mem_fields:
-            self.shared_lib.ocp_nlp_set(self.nlp_config, \
-                self.nlp_solver, stage, field, value_data_p)
         return
 
 
-    def set_param(self, stage_, value_):
-        value_data = cast(value_.ctypes.data, POINTER(c_double))
-        self._set_param(self.capsule, stage_, value_data, value_.shape[0])
-
-    def cost_set(self, start_stage_, field_, value_, api='warn'):
-      self.cost_set_slice(start_stage_, start_stage_+1, field_, value_[None], api='warn')
-
-    def cost_set_slice(self, start_stage_, end_stage_, field_, value_, api='warn'):
+    def cost_set(self, stage_, field_, value_, api='warn'):
         """
         Set numerical data in the cost module of the solver.
 
@@ -1238,39 +1318,133 @@ class AcadosOcpSolver:
             :param value: of appropriate size
         """
         # cast value_ to avoid conversion issues
-        field = field_.encode('utf-8')
-        if len(value_.shape) > 2:
-          dim = value_.shape[1]*value_.shape[2]
-        else:
-          dim = value_.shape[1]
+        if isinstance(value_, (float, int)):
+            value_ = np.array([value_])
+        value_ = value_.astype(float)
 
-        self.shared_lib.ocp_nlp_cost_model_set_slice(self.nlp_config, \
-            self.nlp_dims, self.nlp_in, start_stage_, end_stage_, field,
-            cast(value_.ctypes.data, c_void_p), dim)
+        field = field_
+        field = field.encode('utf-8')
+
+        stage = c_int(stage_)
+        self.shared_lib.ocp_nlp_cost_dims_get_from_attr.argtypes = \
+            [c_void_p, c_void_p, c_void_p, c_int, c_char_p, POINTER(c_int)]
+        self.shared_lib.ocp_nlp_cost_dims_get_from_attr.restype = c_int
+
+        dims = np.ascontiguousarray(np.zeros((2,)), dtype=np.intc)
+        dims_data = cast(dims.ctypes.data, POINTER(c_int))
+
+        self.shared_lib.ocp_nlp_cost_dims_get_from_attr(self.nlp_config, \
+            self.nlp_dims, self.nlp_out, stage_, field, dims_data)
+
+        value_shape = value_.shape
+        if len(value_shape) == 1:
+            value_shape = (value_shape[0], 0)
+
+        elif len(value_shape) == 2:
+            if api=='old':
+                pass
+            elif api=='warn':
+                if not np.all(np.ravel(value_, order='F')==np.ravel(value_, order='K')):
+                    raise Exception("Ambiguity in API detected.\n"
+                                    "Are you making an acados model from scrach? Add api='new' to cost_set and carry on.\n"
+                                    "Are you seeing this error suddenly in previously running code? Read on.\n"
+                                    "  You are relying on a now-fixed bug in cost_set for field '{}'.\n".format(field_) +
+                                    "  acados_template now correctly passes on any matrices to acados in column major format.\n" +
+                                    "  Two options to fix this error: \n" +
+                                    "   * Add api='old' to cost_set to restore old incorrect behaviour\n" +
+                                    "   * Add api='new' to cost_set and remove any unnatural manipulation of the value argument " +
+                                    "such as non-mathematical transposes, reshaping, casting to fortran order, etc... " +
+                                    "If there is no such manipulation, then you have probably been getting an incorrect solution before.")
+                # Get elements in column major order
+                value_ = np.ravel(value_, order='F')
+            elif api=='new':
+                # Get elements in column major order
+                value_ = np.ravel(value_, order='F')
+            else:
+                raise Exception("Unknown api: '{}'".format(api))
+
+        if value_shape != tuple(dims):
+            raise Exception('AcadosOcpSolver.cost_set(): mismatching dimension', \
+                ' for field "{}" with dimension {} (you have {})'.format( \
+                field_, tuple(dims), value_shape))
+
+        value_data = cast(value_.ctypes.data, POINTER(c_double))
+        value_data_p = cast((value_data), c_void_p)
+
+        self.shared_lib.ocp_nlp_cost_model_set.argtypes = \
+            [c_void_p, c_void_p, c_void_p, c_int, c_char_p, c_void_p]
+        self.shared_lib.ocp_nlp_cost_model_set(self.nlp_config, \
+            self.nlp_dims, self.nlp_in, stage, field, value_data_p)
+
+        return
 
 
-    def constraints_set(self, start_stage_, field_, value_, api='warn'):
-      self.constraints_set_slice(start_stage_, start_stage_+1, field_, value_[None], api='warn')
-
-
-    def constraints_set_slice(self, start_stage_, end_stage_, field_, value_, api='warn'):
+    def constraints_set(self, stage_, field_, value_, api='warn'):
         """
         Set numerical data in the constraint module of the solver.
 
             :param stage: integer corresponding to shooting node
-            :param field: string in ['lbx', 'ubx', 'lbu', 'ubu', 'lg', 'ug', 'lh', 'uh', 'uphi']
+            :param field: string in ['lbx', 'ubx', 'lbu', 'ubu', 'lg', 'ug', 'lh', 'uh', 'uphi', 'C', 'D']
             :param value: of appropriate size
         """
+        # cast value_ to avoid conversion issues
+        if isinstance(value_, (float, int)):
+            value_ = np.array([value_])
+        value_ = value_.astype(float)
 
-        field = field_.encode('utf-8')
-        if len(value_.shape) > 2:
-          dim = value_.shape[1]*value_.shape[2]
-        else:
-          dim = value_.shape[1]
+        field = field_
+        field = field.encode('utf-8')
 
-        self.shared_lib.ocp_nlp_constraints_model_set_slice(self.nlp_config, \
-            self.nlp_dims, self.nlp_in, start_stage_, end_stage_, field,
-            cast(value_.ctypes.data, c_void_p), dim)
+        stage = c_int(stage_)
+        self.shared_lib.ocp_nlp_constraint_dims_get_from_attr.argtypes = \
+            [c_void_p, c_void_p, c_void_p, c_int, c_char_p, POINTER(c_int)]
+        self.shared_lib.ocp_nlp_constraint_dims_get_from_attr.restype = c_int
+
+        dims = np.ascontiguousarray(np.zeros((2,)), dtype=np.intc)
+        dims_data = cast(dims.ctypes.data, POINTER(c_int))
+
+        self.shared_lib.ocp_nlp_constraint_dims_get_from_attr(self.nlp_config, \
+            self.nlp_dims, self.nlp_out, stage_, field, dims_data)
+
+        value_shape = value_.shape
+        if len(value_shape) == 1:
+            value_shape = (value_shape[0], 0)
+        elif len(value_shape) == 2:
+            if api=='old':
+                pass
+            elif api=='warn':
+                if not np.all(np.ravel(value_, order='F')==np.ravel(value_, order='K')):
+                    raise Exception("Ambiguity in API detected.\n"
+                                    "Are you making an acados model from scrach? Add api='new' to constraints_set and carry on.\n"
+                                    "Are you seeing this error suddenly in previously running code? Read on.\n"
+                                    "  You are relying on a now-fixed bug in constraints_set for field '{}'.\n".format(field_) +
+                                    "  acados_template now correctly passes on any matrices to acados in column major format.\n" +
+                                    "  Two options to fix this error: \n" +
+                                    "   * Add api='old' to constraints_set to restore old incorrect behaviour\n" +
+                                    "   * Add api='new' to constraints_set and remove any unnatural manipulation of the value argument " +
+                                    "such as non-mathematical transposes, reshaping, casting to fortran order, etc... " +
+                                    "If there is no such manipulation, then you have probably been getting an incorrect solution before.")
+                # Get elements in column major order
+                value_ = np.ravel(value_, order='F')
+            elif api=='new':
+                # Get elements in column major order
+                value_ = np.ravel(value_, order='F')
+            else:
+                raise Exception("Unknown api: '{}'".format(api))
+
+        if value_shape != tuple(dims):
+            raise Exception('AcadosOcpSolver.constraints_set(): mismatching dimension' \
+                ' for field "{}" with dimension {} (you have {})'.format(field_, tuple(dims), value_shape))
+
+        value_data = cast(value_.ctypes.data, POINTER(c_double))
+        value_data_p = cast((value_data), c_void_p)
+
+        self.shared_lib.ocp_nlp_constraints_model_set.argtypes = \
+            [c_void_p, c_void_p, c_void_p, c_int, c_char_p, c_void_p]
+        self.shared_lib.ocp_nlp_constraints_model_set(self.nlp_config, \
+            self.nlp_dims, self.nlp_in, stage, field, value_data_p)
+
+        return
 
 
     def dynamics_get(self, stage_, field_):
diff --git a/pyextra/acados_template/acados_ocp_solver_pyx.pyx b/pyextra/acados_template/acados_ocp_solver_pyx.pyx
new file mode 100644
index 000000000..b950d880b
--- /dev/null
+++ b/pyextra/acados_template/acados_ocp_solver_pyx.pyx
@@ -0,0 +1,427 @@
+# -*- coding: future_fstrings -*-
+#
+# Copyright 2019 Gianluca Frison, Dimitris Kouzoupis, Robin Verschueren,
+# Andrea Zanelli, Niels van Duijkeren, Jonathan Frey, Tommaso Sartor,
+# Branimir Novoselnik, Rien Quirynen, Rezart Qelibari, Dang Doan,
+# Jonas Koenemann, Yutao Chen, Tobias Schöls, Jonas Schlagenhauf, Moritz Diehl
+#
+# This file is part of acados.
+#
+# The 2-Clause BSD License
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.;
+#
+# cython: language_level=3
+# cython: profile=False
+# distutils: language=c
+
+cimport cython
+from libc cimport string
+
+cimport acados_solver_common
+cimport acados_solver
+
+cimport numpy as cnp
+
+import os
+import numpy as np
+
+
+cdef class AcadosOcpSolverFast:
+    """
+    Class to interact with the acados ocp solver C object.
+
+        :param acados_ocp: type AcadosOcp - description of the OCP for acados
+        :param json_file: name for the json file used to render the templated code - default: acados_ocp_nlp.json
+        :param simulink_opts: Options to configure Simulink S-function blocks, mainly to activate possible Inputs and Outputs
+    """
+
+    cdef acados_solver.nlp_solver_capsule *capsule
+    cdef void *nlp_opts
+    cdef acados_solver_common.ocp_nlp_dims *nlp_dims
+    cdef acados_solver_common.ocp_nlp_config *nlp_config
+    cdef acados_solver_common.ocp_nlp_out *nlp_out
+    cdef acados_solver_common.ocp_nlp_in *nlp_in
+    cdef acados_solver_common.ocp_nlp_solver *nlp_solver
+
+    cdef str model_name
+    cdef int N
+    cdef bint solver_created
+
+    def __cinit__(self, str model_name, int N, str code_export_dir):
+        self.model_name = model_name
+        self.N = N
+
+        self.solver_created = False
+
+        # create capsule
+        self.capsule = acados_solver.acados_create_capsule()
+
+        # create solver
+        assert acados_solver.acados_create(self.capsule) == 0
+        self.solver_created = True
+
+        # get pointers solver
+        self.nlp_opts = acados_solver.acados_get_nlp_opts(self.capsule)
+        self.nlp_dims = acados_solver.acados_get_nlp_dims(self.capsule)
+        self.nlp_config = acados_solver.acados_get_nlp_config(self.capsule)
+        self.nlp_out = acados_solver.acados_get_nlp_out(self.capsule)
+        self.nlp_in = acados_solver.acados_get_nlp_in(self.capsule)
+        self.nlp_solver = acados_solver.acados_get_nlp_solver(self.capsule)
+
+
+    def solve(self):
+        """
+        Solve the ocp with current input.
+        """
+        return acados_solver.acados_solve(self.capsule)
+
+
+    def set_new_time_steps(self, new_time_steps):
+        """
+        Set new time steps before solving. Only reload library without code generation but with new time steps.
+
+            :param new_time_steps: vector of new time steps for the solver
+
+            .. note:: This allows for different use-cases: either set a new size of time-steps or a new distribution of
+                      the shooting nodes without changing the number, e.g., to reach a different final time. Both cases
+                      do not require a new code export and compilation.
+        """
+        raise NotImplementedError()
+
+
+    def get(self, int stage, str field_):
+        """
+        Get the last solution of the solver:
+
+            :param stage: integer corresponding to shooting node
+            :param field: string in ['x', 'u', 'z', 'pi', 'lam', 't', 'sl', 'su',]
+
+            .. note:: regarding lam, t: \n
+                    the inequalities are internally organized in the following order: \n
+                    [ lbu lbx lg lh lphi ubu ubx ug uh uphi; \n
+                      lsbu lsbx lsg lsh lsphi usbu usbx usg ush usphi]
+
+            .. note:: pi: multipliers for dynamics equality constraints \n
+                      lam: multipliers for inequalities \n
+                      t: slack variables corresponding to evaluation of all inequalities (at the solution) \n
+                      sl: slack variables of soft lower inequality constraints \n
+                      su: slack variables of soft upper inequality constraints \n
+        """
+
+        out_fields = ['x', 'u', 'z', 'pi', 'lam', 't', 'sl', 'su']
+        field = field_.encode('utf-8')
+
+        if field_ not in out_fields:
+            raise Exception('AcadosOcpSolver.get(): {} is an invalid argument.\
+                    \n Possible values are {}. Exiting.'.format(field_, out_fields))
+
+        if stage < 0 or stage > self.N:
+            raise Exception('AcadosOcpSolver.get(): stage index must be in [0, N], got: {}.'.format(self.N))
+
+        if stage == self.N and field_ == 'pi':
+            raise Exception('AcadosOcpSolver.get(): field {} does not exist at final stage {}.'\
+                .format(field_, stage))
+
+        cdef int dims = acados_solver_common.ocp_nlp_dims_get_from_attr(self.nlp_config,
+            self.nlp_dims, self.nlp_out, stage, field)
+
+        cdef cnp.ndarray[cnp.float64_t, ndim=1] out = np.zeros((dims,))
+        acados_solver_common.ocp_nlp_out_get(self.nlp_config, \
+            self.nlp_dims, self.nlp_out, stage, field, <void *> out.data)
+
+        return out
+
+
+    def print_statistics(self):
+        """
+        prints statistics of previous solver run as a table:
+            - iter: iteration number
+            - res_stat: stationarity residual
+            - res_eq: residual wrt equality constraints (dynamics)
+            - res_ineq: residual wrt inequality constraints (constraints)
+            - res_comp: residual wrt complementarity conditions
+            - qp_stat: status of QP solver
+            - qp_iter: number of QP iterations
+            - qp_res_stat: stationarity residual of the last QP solution
+            - qp_res_eq: residual wrt equality constraints (dynamics) of the last QP solution
+            - qp_res_ineq: residual wrt inequality constraints (constraints)  of the last QP solution
+            - qp_res_comp: residual wrt complementarity conditions of the last QP solution
+        """
+        raise NotImplementedError()
+
+
+    def store_iterate(self, filename='', overwrite=False):
+        """
+        Stores the current iterate of the ocp solver in a json file.
+
+            :param filename: if not set, use model_name + timestamp + '.json'
+            :param overwrite: if false and filename exists add timestamp to filename
+        """
+        raise NotImplementedError()
+
+
+    def load_iterate(self, filename):
+        """
+        Loads the iterate stored in json file with filename into the ocp solver.
+        """
+        raise NotImplementedError()
+
+
+    def get_stats(self, field_):
+        """
+        Get the information of the last solver call.
+
+            :param field: string in ['statistics', 'time_tot', 'time_lin', 'time_sim', 'time_sim_ad', 'time_sim_la', 'time_qp', 'time_qp_solver_call', 'time_reg', 'sqp_iter']
+        """
+        raise NotImplementedError()
+
+
+    def get_cost(self):
+        """
+        Returns the cost value of the current solution.
+        """
+        # compute cost internally
+        acados_solver_common.ocp_nlp_eval_cost(self.nlp_solver, self.nlp_in, self.nlp_out)
+
+        # create output
+        cdef double out
+
+        # call getter
+        acados_solver_common.ocp_nlp_get(self.nlp_config, self.nlp_solver, "cost_value", <void *> &out)
+
+        return out
+
+
+    def get_residuals(self):
+        """
+        Returns an array of the form [res_stat, res_eq, res_ineq, res_comp].
+        """
+        raise NotImplementedError()
+
+
+    # Note: this function should not be used anymore, better use cost_set, constraints_set
+    def set(self, int stage, str field_, value_):
+
+        """
+        Set numerical data inside the solver.
+
+            :param stage: integer corresponding to shooting node
+            :param field: string in ['x', 'u', 'pi', 'lam', 't', 'p']
+
+            .. note:: regarding lam, t: \n
+                    the inequalities are internally organized in the following order: \n
+                    [ lbu lbx lg lh lphi ubu ubx ug uh uphi; \n
+                      lsbu lsbx lsg lsh lsphi usbu usbx usg ush usphi]
+
+            .. note:: pi: multipliers for dynamics equality constraints \n
+                      lam: multipliers for inequalities \n
+                      t: slack variables corresponding to evaluation of all inequalities (at the solution) \n
+                      sl: slack variables of soft lower inequality constraints \n
+                      su: slack variables of soft upper inequality constraints \n
+        """
+        cost_fields = ['y_ref', 'yref']
+        constraints_fields = ['lbx', 'ubx', 'lbu', 'ubu']
+        out_fields = ['x', 'u', 'pi', 'lam', 't', 'z', 'sl', 'su']
+
+        field = field_.encode('utf-8')
+
+        cdef double[::1] value
+
+        # treat parameters separately
+        if field_ == 'p':
+            value = np.ascontiguousarray(value_, dtype=np.double)
+            assert acados_solver.acados_update_params(self.capsule, stage, <double *> &value[0], value.shape[0]) == 0
+        else:
+            if field_ not in constraints_fields + cost_fields + out_fields:
+                raise Exception("AcadosOcpSolver.set(): {} is not a valid argument.\
+                    \nPossible values are {}. Exiting.".format(field, \
+                    constraints_fields + cost_fields + out_fields + ['p']))
+
+            dims = acados_solver_common.ocp_nlp_dims_get_from_attr(self.nlp_config,
+                self.nlp_dims, self.nlp_out, stage, field)
+
+            if value_.shape[0] != dims:
+                msg = 'AcadosOcpSolver.set(): mismatching dimension for field "{}" '.format(field_)
+                msg += 'with dimension {} (you have {})'.format(dims, value_.shape[0])
+                raise Exception(msg)
+
+            value = np.ascontiguousarray(value_, dtype=np.double)
+            if field_ in constraints_fields:
+                acados_solver_common.ocp_nlp_constraints_model_set(self.nlp_config,
+                    self.nlp_dims, self.nlp_in, stage, field, <void *> &value[0])
+            elif field_ in cost_fields:
+                acados_solver_common.ocp_nlp_cost_model_set(self.nlp_config,
+                    self.nlp_dims, self.nlp_in, stage, field, <void *> &value[0])
+            elif field_ in out_fields:
+                acados_solver_common.ocp_nlp_out_set(self.nlp_config,
+                    self.nlp_dims, self.nlp_out, stage, field, <void *> &value[0])
+
+
+    def cost_set(self, int stage, str field_, value_):
+        """
+        Set numerical data in the cost module of the solver.
+
+            :param stage: integer corresponding to shooting node
+            :param field: string, e.g. 'yref', 'W', 'ext_cost_num_hess'
+            :param value: of appropriate size
+        """
+        field = field_.encode('utf-8')
+
+        cdef int dims[2]
+        acados_solver_common.ocp_nlp_cost_dims_get_from_attr(self.nlp_config, \
+            self.nlp_dims, self.nlp_out, stage, field, &dims[0])
+
+        cdef double[::1,:] value
+
+        value_shape = value_.shape
+        if len(value_shape) == 1:
+            value_shape = (value_shape[0], 0)
+            value = np.asfortranarray(value_[None,:])
+
+        elif len(value_shape) == 2:
+            # Get elements in column major order
+            value = np.asfortranarray(value_)
+
+        if value_shape[0] != dims[0] or value_shape[1] != dims[1]:
+            raise Exception('AcadosOcpSolver.cost_set(): mismatching dimension', \
+                ' for field "{}" with dimension {} (you have {})'.format( \
+                field_, tuple(dims), value_shape))
+
+        acados_solver_common.ocp_nlp_cost_model_set(self.nlp_config, \
+            self.nlp_dims, self.nlp_in, stage, field, <void *> &value[0][0])
+
+
+    def constraints_set(self, int stage, str field_, value_):
+        """
+        Set numerical data in the constraint module of the solver.
+
+            :param stage: integer corresponding to shooting node
+            :param field: string in ['lbx', 'ubx', 'lbu', 'ubu', 'lg', 'ug', 'lh', 'uh', 'uphi', 'C', 'D']
+            :param value: of appropriate size
+        """
+        field = field_.encode('utf-8')
+
+        cdef int dims[2]
+        acados_solver_common.ocp_nlp_constraint_dims_get_from_attr(self.nlp_config, \
+            self.nlp_dims, self.nlp_out, stage, field, &dims[0])
+
+        cdef double[::1,:] value
+
+        value_shape = value_.shape
+        if len(value_shape) == 1:
+            value_shape = (value_shape[0], 0)
+            value = np.asfortranarray(value_[None,:])
+
+        elif len(value_shape) == 2:
+            # Get elements in column major order
+            value = np.asfortranarray(value_)
+
+        if value_shape[0] != dims[0] or value_shape[1] != dims[1]:
+            raise Exception('AcadosOcpSolver.constraints_set(): mismatching dimension' \
+                ' for field "{}" with dimension {} (you have {})'.format(field_, tuple(dims), value_shape))
+
+        acados_solver_common.ocp_nlp_constraints_model_set(self.nlp_config, \
+            self.nlp_dims, self.nlp_in, stage, field, <void *> &value[0][0])
+
+        return
+
+
+    def dynamics_get(self, int stage, str field_):
+        """
+        Get numerical data from the dynamics module of the solver:
+
+            :param stage: integer corresponding to shooting node
+            :param field: string, e.g. 'A'
+        """
+        field = field_.encode('utf-8')
+
+        # get dims
+        cdef int[2] dims
+        acados_solver_common.ocp_nlp_dynamics_dims_get_from_attr(self.nlp_config, self.nlp_dims, self.nlp_out, stage, field, &dims[0])
+
+        # create output data
+        out = np.zeros((dims[0], dims[1]), order='F', dtype=np.float64)
+
+        # call getter
+        acados_solver_common.ocp_nlp_get_at_stage(self.nlp_config, self.nlp_dims, self.nlp_solver, stage, field, <void *> out.data)
+
+        return out
+
+
+    def options_set(self, str field_, value_):
+        """
+        Set options of the solver.
+
+            :param field: string, e.g. 'print_level', 'rti_phase', 'initialize_t_slacks', 'step_length', 'alpha_min', 'alpha_reduction'
+            :param value: of type int, float
+        """
+        int_fields = ['print_level', 'rti_phase', 'initialize_t_slacks']
+        double_fields = ['step_length', 'tol_eq', 'tol_stat', 'tol_ineq', 'tol_comp', 'alpha_min', 'alpha_reduction']
+        string_fields = ['globalization']
+
+        # encode
+        field = field_.encode('utf-8')
+
+        cdef int int_value
+        cdef double double_value
+        cdef unsigned char[::1] string_value
+
+        # check field availability and type
+        if field_ in int_fields:
+            if not isinstance(value_, int):
+                raise Exception('solver option {} must be of type int. You have {}.'.format(field_, type(value_)))
+
+            if field_ == 'rti_phase':
+                if value_ < 0 or value_ > 2:
+                    raise Exception('AcadosOcpSolver.solve(): argument \'rti_phase\' can '
+                        'take only values 0, 1, 2 for SQP-RTI-type solvers')
+                if self.acados_ocp.solver_options.nlp_solver_type != 'SQP_RTI' and value_ > 0:
+                    raise Exception('AcadosOcpSolver.solve(): argument \'rti_phase\' can '
+                        'take only value 0 for SQP-type solvers')
+
+            int_value = value_
+            acados_solver_common.ocp_nlp_solver_opts_set(self.nlp_config, self.nlp_opts, field, <void *> &int_value)
+
+        elif field_ in double_fields:
+            if not isinstance(value_, float):
+                raise Exception('solver option {} must be of type float. You have {}.'.format(field_, type(value_)))
+
+            double_value = value_
+            acados_solver_common.ocp_nlp_solver_opts_set(self.nlp_config, self.nlp_opts, field, <void *> &double_value)
+
+        elif field_ in string_fields:
+            if not isinstance(value_, bytes):
+                raise Exception('solver option {} must be of type str. You have {}.'.format(field_, type(value_)))
+
+            string_value = value_.encode('utf-8')
+            acados_solver_common.ocp_nlp_solver_opts_set(self.nlp_config, self.nlp_opts, field, <void *> &string_value[0])
+
+        raise Exception('AcadosOcpSolver.options_set() does not support field {}.'\
+            '\n Possible values are {}.'.format(field_, ', '.join(int_fields + double_fields + string_fields)))
+
+
+    def __del__(self):
+        if self.solver_created:
+            acados_solver.acados_free(self.capsule)
+            acados_solver.acados_free_capsule(self.capsule)
diff --git a/pyextra/acados_template/acados_sim.py b/pyextra/acados_template/acados_sim.py
index 3ae198865..d7ad1487d 100644
--- a/pyextra/acados_template/acados_sim.py
+++ b/pyextra/acados_template/acados_sim.py
@@ -106,7 +106,8 @@ class AcadosSimOpts:
     """
     def __init__(self):
         self.__integrator_type = 'ERK'
-        self.__tf = None
+        self.__collocation_type = 'GAUSS_LEGENDRE'
+        self.__Tsim = None
         # ints
         self.__sim_method_num_stages = 1
         self.__sim_method_num_steps = 1
@@ -174,6 +175,15 @@ class AcadosSimOpts:
         """Time horizon"""
         return self.__Tsim
 
+    @property
+    def collocation_type(self):
+        """Collocation type: relevant for implicit integrators
+        -- string in {GAUSS_RADAU_IIA, GAUSS_LEGENDRE}
+
+        Default: GAUSS_LEGENDRE
+        """
+        return self.__collocation_type
+
     @integrator_type.setter
     def integrator_type(self, integrator_type):
         integrator_types = ('ERK', 'IRK', 'GNSF')
@@ -183,6 +193,15 @@ class AcadosSimOpts:
             raise Exception('Invalid integrator_type value. Possible values are:\n\n' \
                     + ',\n'.join(integrator_types) + '.\n\nYou have: ' + integrator_type + '.\n\nExiting.')
 
+    @collocation_type.setter
+    def collocation_type(self, collocation_type):
+        collocation_types = ('GAUSS_RADAU_IIA', 'GAUSS_LEGENDRE')
+        if collocation_type in collocation_types:
+            self.__collocation_type = collocation_type
+        else:
+            raise Exception('Invalid collocation_type value. Possible values are:\n\n' \
+                    + ',\n'.join(collocation_types) + '.\n\nYou have: ' + collocation_type + '.\n\nExiting.')
+
     @T.setter
     def T(self, T):
         self.__Tsim = T
@@ -262,6 +281,8 @@ class AcadosSim:
 
     - :py:attr:`acados_include_path` (set automatically)
     - :py:attr:`acados_lib_path` (set automatically)
+    - :py:attr:`parameter_values` - used to initialize the parameters (can be changed)
+
     """
     def __init__(self, acados_path=''):
         if acados_path == '':
@@ -281,6 +302,21 @@ class AcadosSim:
         self.code_export_directory = 'c_generated_code'
         """Path to where code will be exported. Default: `c_generated_code`."""
 
+        self.__parameter_values = np.array([])
+
+    @property
+    def parameter_values(self):
+        """:math:`p` - initial values for parameter - can be updated"""
+        return self.__parameter_values
+
+    @parameter_values.setter
+    def parameter_values(self, parameter_values):
+        if isinstance(parameter_values, np.ndarray):
+            self.__parameter_values = parameter_values
+        else:
+            raise Exception('Invalid parameter_values value. ' +
+                            f'Expected numpy array, got {type(parameter_values)}.')
+
     def set(self, attr, value):
         # tokenize string
         tokens = attr.split('_', 1)
diff --git a/pyextra/acados_template/acados_sim_layout.json b/pyextra/acados_template/acados_sim_layout.json
index c8f4280b8..25b149613 100644
--- a/pyextra/acados_template/acados_sim_layout.json
+++ b/pyextra/acados_template/acados_sim_layout.json
@@ -28,6 +28,9 @@
         "integrator_type": [
             "str"
         ],
+        "collocation_type": [
+            "str"
+        ],
         "Tsim": [
             "float"
         ],
diff --git a/pyextra/acados_template/acados_sim_solver.py b/pyextra/acados_template/acados_sim_solver.py
index a5447907b..145f90293 100644
--- a/pyextra/acados_template/acados_sim_solver.py
+++ b/pyextra/acados_template/acados_sim_solver.py
@@ -46,7 +46,7 @@ from .acados_sim import AcadosSim
 from .acados_ocp import AcadosOcp
 from .acados_model import acados_model_strip_casadi_symbolics
 from .utils import is_column, render_template, format_class_dict, np_array_to_list,\
-     make_model_consistent, set_up_imported_gnsf_model
+     make_model_consistent, set_up_imported_gnsf_model, get_python_interface_path
 
 
 def make_sim_dims_consistent(acados_sim):
@@ -84,14 +84,13 @@ def make_sim_dims_consistent(acados_sim):
 
 
 def get_sim_layout():
-    current_module = sys.modules[__name__]
-    acados_path = os.path.dirname(current_module.__file__)
-    with open(acados_path + '/acados_sim_layout.json', 'r') as f:
+    python_interface_path = get_python_interface_path()
+    abs_path = os.path.join(python_interface_path, 'acados_sim_layout.json')
+    with open(abs_path, 'r') as f:
         sim_layout = json.load(f)
     return sim_layout
 
 
-
 def sim_formulation_json_dump(acados_sim, json_file='acados_sim.json'):
     # Load acados_sim structure description
     sim_layout = get_sim_layout()
@@ -114,9 +113,7 @@ def sim_formulation_json_dump(acados_sim, json_file='acados_sim.json'):
 
 def sim_render_templates(json_file, model_name, code_export_dir):
     # setting up loader and environment
-    json_path = '{cwd}/{json_file}'.format(
-        cwd=os.getcwd(),
-        json_file=json_file)
+    json_path = os.path.join(os.getcwd(), json_file)
 
     if not os.path.exists(json_path):
         raise Exception(f"{json_path} not found!")
@@ -141,7 +138,7 @@ def sim_render_templates(json_file, model_name, code_export_dir):
     render_template(in_file, out_file, template_dir, json_path)
 
     ## folder model
-    template_dir = f'{code_export_dir}/{model_name}_model/'
+    template_dir = os.path.join(code_export_dir, model_name + '_model')
 
     in_file = 'model.in.h'
     out_file = f'{model_name}_model.h'
diff --git a/pyextra/acados_template/acados_solver_common.pxd b/pyextra/acados_template/acados_solver_common.pxd
new file mode 100644
index 000000000..9314802e6
--- /dev/null
+++ b/pyextra/acados_template/acados_solver_common.pxd
@@ -0,0 +1,102 @@
+# -*- coding: future_fstrings -*-
+#
+# Copyright 2019 Gianluca Frison, Dimitris Kouzoupis, Robin Verschueren,
+# Andrea Zanelli, Niels van Duijkeren, Jonathan Frey, Tommaso Sartor,
+# Branimir Novoselnik, Rien Quirynen, Rezart Qelibari, Dang Doan,
+# Jonas Koenemann, Yutao Chen, Tobias Schöls, Jonas Schlagenhauf, Moritz Diehl
+#
+# This file is part of acados.
+#
+# The 2-Clause BSD License
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.;
+#
+
+
+cdef extern from "acados/ocp_nlp/ocp_nlp_common.h":
+    ctypedef struct ocp_nlp_config:
+        pass
+
+    ctypedef struct ocp_nlp_dims:
+        pass
+
+    ctypedef struct ocp_nlp_in:
+        pass
+
+    ctypedef struct ocp_nlp_out:
+        pass
+
+
+cdef extern from "acados_c/ocp_nlp_interface.h":
+    ctypedef enum ocp_nlp_solver_t:
+        pass
+
+    ctypedef enum ocp_nlp_cost_t:
+        pass
+
+    ctypedef enum ocp_nlp_dynamics_t:
+        pass
+
+    ctypedef enum ocp_nlp_constraints_t:
+        pass
+
+    ctypedef enum ocp_nlp_reg_t:
+        pass
+
+    ctypedef struct ocp_nlp_plan:
+        pass
+
+    ctypedef struct ocp_nlp_solver:
+        pass
+
+    int ocp_nlp_cost_model_set(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_in *in_,
+        int start_stage, const char *field, void *value)
+    int ocp_nlp_constraints_model_set(ocp_nlp_config *config, ocp_nlp_dims *dims,
+        ocp_nlp_in *in_, int stage, const char *field, void *value)
+
+    # out
+    void ocp_nlp_out_set(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field, void *value)
+    void ocp_nlp_out_get(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field, void *value)
+    void ocp_nlp_get_at_stage(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_solver *solver,
+        int stage, const char *field, void *value)
+    int ocp_nlp_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field)
+    void ocp_nlp_constraint_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field, int *dims_out)
+    void ocp_nlp_cost_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field, int *dims_out)
+    void ocp_nlp_dynamics_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field, int *dims_out)
+
+    # opts
+    void ocp_nlp_solver_opts_set(ocp_nlp_config *config, void *opts_, const char *field, void* value)
+
+    # solver
+    void ocp_nlp_eval_residuals(ocp_nlp_solver *solver, ocp_nlp_in *nlp_in, ocp_nlp_out *nlp_out)
+    void ocp_nlp_eval_cost(ocp_nlp_solver *solver, ocp_nlp_in *nlp_in_, ocp_nlp_out *nlp_out)
+
+    # get/set
+    void ocp_nlp_get(ocp_nlp_config *config, ocp_nlp_solver *solver, const char *field, void *return_value_)
+    void ocp_nlp_set(ocp_nlp_config *config, ocp_nlp_solver *solver, int stage, const char *field, void *value)
diff --git a/pyextra/acados_template/c_templates_tera/Makefile.in b/pyextra/acados_template/c_templates_tera/Makefile.in
index 330a5aba1..487e66ab0 100644
--- a/pyextra/acados_template/c_templates_tera/Makefile.in
+++ b/pyextra/acados_template/c_templates_tera/Makefile.in
@@ -279,12 +279,12 @@ all: clean casadi_fun example_sim example
 shared_lib: bundled_shared_lib ocp_shared_lib sim_shared_lib
 {%- endif %}
 
-CASADI_MODEL_SOURCE= 
+CASADI_MODEL_SOURCE=
 {%- if solver_options.integrator_type == "ERK" %}
 CASADI_MODEL_SOURCE+= {{ model.name }}_expl_ode_fun.c
-CASADI_MODEL_SOURCE+= {{ model.name }}_expl_vde_forw.c 
+CASADI_MODEL_SOURCE+= {{ model.name }}_expl_vde_forw.c
 {%- if hessian_approx == "EXACT" %}
-CASADI_MODEL_SOURCE+= {{ model.name }}_expl_ode_hess.c 
+CASADI_MODEL_SOURCE+= {{ model.name }}_expl_ode_hess.c
 {%- endif %}
 {%- elif solver_options.integrator_type == "IRK" %}
 CASADI_MODEL_SOURCE+= {{ model.name }}_impl_dae_fun.c
@@ -453,7 +453,7 @@ ocp_shared_lib: casadi_fun ocp_solver
 	-L $(LIB_PATH) -lacados -lhpipm -lblasfeo \
 	{{ link_libs }} \
 	-lm \
-	
+
 {%- else %}
 
 ocp_shared_lib: casadi_fun ocp_solver
@@ -465,9 +465,34 @@ ocp_shared_lib: casadi_fun ocp_solver
 	-L $(LIB_PATH) -lacados -lhpipm -lblasfeo \
 	{{ link_libs }} \
 	-lm \
-	
+
 {%- endif %}
 
+ocp_cython_c: ocp_shared_lib
+	cython \
+	-o acados_ocp_solver_pyx.c \
+	-I $(INCLUDE_PATH)/../interfaces/acados_template/acados_template \
+	$(INCLUDE_PATH)/../interfaces/acados_template/acados_template/acados_ocp_solver_pyx.pyx \
+
+ocp_cython_o: ocp_cython_c
+	clang $(ACADOS_FLAGS) -c -O2 \
+	-o acados_ocp_solver_pyx.o \
+	-I /usr/include/python3.8 \
+	-I $(INCLUDE_PATH)/blasfeo/include/ \
+	-I $(INCLUDE_PATH)/hpipm/include/ \
+	-I $(INCLUDE_PATH) \
+	acados_ocp_solver_pyx.c \
+
+ocp_cython: ocp_cython_o
+	clang $(ACADOS_FLAGS) -shared \
+	-o acados_ocp_solver_pyx.so \
+	-Wl,-rpath=$(LIB_PATH) \
+	acados_ocp_solver_pyx.o \
+	$(abspath .)/libacados_ocp_solver_{{ model.name }}.so \
+	-L $(LIB_PATH) -lacados -lhpipm -lblasfeo -lqpOASES_e \
+	{{ link_libs }} \
+	-lm \
+
 sim_shared_lib: casadi_fun sim_solver
 	gcc $(ACADOS_FLAGS) -shared -o libacados_sim_solver_{{ model.name }}.so $(SIM_OBJ) $(MODEL_OBJ) -L$(EXTERNAL_DIR) -l$(EXTERNAL_LIB) \
 	-L $(LIB_PATH) -lacados -lhpipm -lblasfeo \
diff --git a/pyextra/acados_template/c_templates_tera/acados_mex_create.in.c b/pyextra/acados_template/c_templates_tera/acados_mex_create.in.c
index 7fa08d8f7..251b249b2 100644
--- a/pyextra/acados_template/c_templates_tera/acados_mex_create.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_mex_create.in.c
@@ -52,7 +52,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
     int status = 0;
 
     // create solver
-    nlp_solver_capsule *acados_ocp_capsule = {{ model.name }}_acados_create_capsule();
+    {{ model.name }}_solver_capsule *acados_ocp_capsule = {{ model.name }}_acados_create_capsule();
 
     status = {{ model.name }}_acados_create(acados_ocp_capsule);
 
diff --git a/pyextra/acados_template/c_templates_tera/acados_mex_free.in.c b/pyextra/acados_template/c_templates_tera/acados_mex_free.in.c
index 843440297..560adb0b9 100644
--- a/pyextra/acados_template/c_templates_tera/acados_mex_free.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_mex_free.in.c
@@ -51,7 +51,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
     const mxArray *C_ocp = prhs[0];
     // capsule
     ptr = (long long *) mxGetData( mxGetField( C_ocp, 0, "capsule" ) );
-    nlp_solver_capsule *capsule = (nlp_solver_capsule *) ptr[0];
+    {{ model.name }}_solver_capsule *capsule = ({{ model.name }}_solver_capsule *) ptr[0];
 
     status = {{ model.name }}_acados_free(capsule);
     if (status)
diff --git a/pyextra/acados_template/c_templates_tera/acados_mex_set.in.c b/pyextra/acados_template/c_templates_tera/acados_mex_set.in.c
index ee86a3116..ae8c31155 100644
--- a/pyextra/acados_template/c_templates_tera/acados_mex_set.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_mex_set.in.c
@@ -66,7 +66,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
     const mxArray *C_ocp = prhs[2];
     // capsule
     ptr = (long long *) mxGetData( mxGetField( C_ocp, 0, "capsule" ) );
-    nlp_solver_capsule *capsule = (nlp_solver_capsule *) ptr[0];
+    {{ model.name }}_solver_capsule *capsule = ({{ model.name }}_solver_capsule *) ptr[0];
     // plan
     ptr = (long long *) mxGetData( mxGetField( C_ocp, 0, "plan" ) );
     ocp_nlp_plan *plan = (ocp_nlp_plan *) ptr[0];
diff --git a/pyextra/acados_template/c_templates_tera/acados_mex_solve.in.c b/pyextra/acados_template/c_templates_tera/acados_mex_solve.in.c
index 9fd4feab2..c673d4bf2 100644
--- a/pyextra/acados_template/c_templates_tera/acados_mex_solve.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_mex_solve.in.c
@@ -51,7 +51,7 @@ void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
 
     // capsule
     ptr = (long long *) mxGetData( mxGetField( C_ocp, 0, "capsule" ) );
-    nlp_solver_capsule *capsule = (nlp_solver_capsule *) ptr[0];
+    {{ model.name }}_solver_capsule *capsule = ({{ model.name }}_solver_capsule *) ptr[0];
 
     // solve
     {{ model.name }}_acados_solve(capsule);
diff --git a/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.c b/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.c
index 390506420..971fb8a78 100644
--- a/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.c
@@ -78,9 +78,10 @@ int {{ model.name }}_acados_sim_solver_free_capsule(sim_solver_capsule * capsule
 int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
 {
     // initialize
-    int nx = {{ dims.nx }};
-    int nu = {{ dims.nu }};
-    int nz = {{ dims.nz }};
+    const int nx = {{ model.name | upper }}_NX;
+    const int nu = {{ model.name | upper }}_NU;
+    const int nz = {{ model.name | upper }}_NZ;
+    const int np = {{ model.name | upper }}_NP;
     bool tmp_bool;
 
     {#// double Tsim = {{ solver_options.tf / dims.N }};#}
@@ -98,7 +99,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_impl_dae_fun->casadi_sparsity_out = &{{ model.name }}_impl_dae_fun_sparsity_out;
     capsule->sim_impl_dae_fun->casadi_n_in = &{{ model.name }}_impl_dae_fun_n_in;
     capsule->sim_impl_dae_fun->casadi_n_out = &{{ model.name }}_impl_dae_fun_n_out;
-    external_function_param_casadi_create(capsule->sim_impl_dae_fun, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_impl_dae_fun, np);
 
     capsule->sim_impl_dae_fun_jac_x_xdot_z->casadi_fun = &{{ model.name }}_impl_dae_fun_jac_x_xdot_z;
     capsule->sim_impl_dae_fun_jac_x_xdot_z->casadi_work = &{{ model.name }}_impl_dae_fun_jac_x_xdot_z_work;
@@ -106,7 +107,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_impl_dae_fun_jac_x_xdot_z->casadi_sparsity_out = &{{ model.name }}_impl_dae_fun_jac_x_xdot_z_sparsity_out;
     capsule->sim_impl_dae_fun_jac_x_xdot_z->casadi_n_in = &{{ model.name }}_impl_dae_fun_jac_x_xdot_z_n_in;
     capsule->sim_impl_dae_fun_jac_x_xdot_z->casadi_n_out = &{{ model.name }}_impl_dae_fun_jac_x_xdot_z_n_out;
-    external_function_param_casadi_create(capsule->sim_impl_dae_fun_jac_x_xdot_z, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_impl_dae_fun_jac_x_xdot_z, np);
 
     // external_function_param_casadi impl_dae_jac_x_xdot_u_z;
     capsule->sim_impl_dae_jac_x_xdot_u_z->casadi_fun = &{{ model.name }}_impl_dae_jac_x_xdot_u_z;
@@ -115,7 +116,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_impl_dae_jac_x_xdot_u_z->casadi_sparsity_out = &{{ model.name }}_impl_dae_jac_x_xdot_u_z_sparsity_out;
     capsule->sim_impl_dae_jac_x_xdot_u_z->casadi_n_in = &{{ model.name }}_impl_dae_jac_x_xdot_u_z_n_in;
     capsule->sim_impl_dae_jac_x_xdot_u_z->casadi_n_out = &{{ model.name }}_impl_dae_jac_x_xdot_u_z_n_out;
-    external_function_param_casadi_create(capsule->sim_impl_dae_jac_x_xdot_u_z, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_impl_dae_jac_x_xdot_u_z, np);
 
 {%- if hessian_approx == "EXACT" %}
     capsule->sim_impl_dae_hess = (external_function_param_casadi *) malloc(sizeof(external_function_param_casadi));
@@ -126,7 +127,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_impl_dae_hess->casadi_sparsity_out = &{{ model.name }}_impl_dae_hess_sparsity_out;
     capsule->sim_impl_dae_hess->casadi_n_in = &{{ model.name }}_impl_dae_hess_n_in;
     capsule->sim_impl_dae_hess->casadi_n_out = &{{ model.name }}_impl_dae_hess_n_out;
-    external_function_param_casadi_create(capsule->sim_impl_dae_hess, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_impl_dae_hess, np);
 {%- endif %}
 
     {% elif solver_options.integrator_type == "ERK" %}
@@ -140,7 +141,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_forw_vde_casadi->casadi_sparsity_in = &{{ model.name }}_expl_vde_forw_sparsity_in;
     capsule->sim_forw_vde_casadi->casadi_sparsity_out = &{{ model.name }}_expl_vde_forw_sparsity_out;
     capsule->sim_forw_vde_casadi->casadi_work = &{{ model.name }}_expl_vde_forw_work;
-    external_function_param_casadi_create(capsule->sim_forw_vde_casadi, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_forw_vde_casadi, np);
 
     capsule->sim_expl_ode_fun_casadi->casadi_fun = &{{ model.name }}_expl_ode_fun;
     capsule->sim_expl_ode_fun_casadi->casadi_n_in = &{{ model.name }}_expl_ode_fun_n_in;
@@ -148,7 +149,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_expl_ode_fun_casadi->casadi_sparsity_in = &{{ model.name }}_expl_ode_fun_sparsity_in;
     capsule->sim_expl_ode_fun_casadi->casadi_sparsity_out = &{{ model.name }}_expl_ode_fun_sparsity_out;
     capsule->sim_expl_ode_fun_casadi->casadi_work = &{{ model.name }}_expl_ode_fun_work;
-    external_function_param_casadi_create(capsule->sim_expl_ode_fun_casadi, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_expl_ode_fun_casadi, np);
 
 {%- if hessian_approx == "EXACT" %}
     capsule->sim_expl_ode_hess = (external_function_param_casadi *) malloc(sizeof(external_function_param_casadi));
@@ -159,7 +160,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_expl_ode_hess->casadi_sparsity_out = &{{ model.name }}_expl_ode_hess_sparsity_out;
     capsule->sim_expl_ode_hess->casadi_n_in = &{{ model.name }}_expl_ode_hess_n_in;
     capsule->sim_expl_ode_hess->casadi_n_out = &{{ model.name }}_expl_ode_hess_n_out;
-    external_function_param_casadi_create(capsule->sim_expl_ode_hess, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_expl_ode_hess, np);
 {%- endif %}
 
     {% elif solver_options.integrator_type == "GNSF" -%}
@@ -175,7 +176,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_gnsf_phi_fun->casadi_sparsity_in = &{{ model.name }}_gnsf_phi_fun_sparsity_in;
     capsule->sim_gnsf_phi_fun->casadi_sparsity_out = &{{ model.name }}_gnsf_phi_fun_sparsity_out;
     capsule->sim_gnsf_phi_fun->casadi_work = &{{ model.name }}_gnsf_phi_fun_work;
-    external_function_param_casadi_create(capsule->sim_gnsf_phi_fun, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_gnsf_phi_fun, np);
 
     capsule->sim_gnsf_phi_fun_jac_y->casadi_fun = &{{ model.name }}_gnsf_phi_fun_jac_y;
     capsule->sim_gnsf_phi_fun_jac_y->casadi_n_in = &{{ model.name }}_gnsf_phi_fun_jac_y_n_in;
@@ -183,7 +184,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_gnsf_phi_fun_jac_y->casadi_sparsity_in = &{{ model.name }}_gnsf_phi_fun_jac_y_sparsity_in;
     capsule->sim_gnsf_phi_fun_jac_y->casadi_sparsity_out = &{{ model.name }}_gnsf_phi_fun_jac_y_sparsity_out;
     capsule->sim_gnsf_phi_fun_jac_y->casadi_work = &{{ model.name }}_gnsf_phi_fun_jac_y_work;
-    external_function_param_casadi_create(capsule->sim_gnsf_phi_fun_jac_y, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_gnsf_phi_fun_jac_y, np);
 
     capsule->sim_gnsf_phi_jac_y_uhat->casadi_fun = &{{ model.name }}_gnsf_phi_jac_y_uhat;
     capsule->sim_gnsf_phi_jac_y_uhat->casadi_n_in = &{{ model.name }}_gnsf_phi_jac_y_uhat_n_in;
@@ -191,7 +192,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_gnsf_phi_jac_y_uhat->casadi_sparsity_in = &{{ model.name }}_gnsf_phi_jac_y_uhat_sparsity_in;
     capsule->sim_gnsf_phi_jac_y_uhat->casadi_sparsity_out = &{{ model.name }}_gnsf_phi_jac_y_uhat_sparsity_out;
     capsule->sim_gnsf_phi_jac_y_uhat->casadi_work = &{{ model.name }}_gnsf_phi_jac_y_uhat_work;
-    external_function_param_casadi_create(capsule->sim_gnsf_phi_jac_y_uhat, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_gnsf_phi_jac_y_uhat, np);
 
     capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z->casadi_fun = &{{ model.name }}_gnsf_f_lo_fun_jac_x1k1uz;
     capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z->casadi_n_in = &{{ model.name }}_gnsf_f_lo_fun_jac_x1k1uz_n_in;
@@ -199,7 +200,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z->casadi_sparsity_in = &{{ model.name }}_gnsf_f_lo_fun_jac_x1k1uz_sparsity_in;
     capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z->casadi_sparsity_out = &{{ model.name }}_gnsf_f_lo_fun_jac_x1k1uz_sparsity_out;
     capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z->casadi_work = &{{ model.name }}_gnsf_f_lo_fun_jac_x1k1uz_work;
-    external_function_param_casadi_create(capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z, np);
 
     capsule->sim_gnsf_get_matrices_fun->casadi_fun = &{{ model.name }}_gnsf_get_matrices_fun;
     capsule->sim_gnsf_get_matrices_fun->casadi_n_in = &{{ model.name }}_gnsf_get_matrices_fun_n_in;
@@ -207,7 +208,7 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->sim_gnsf_get_matrices_fun->casadi_sparsity_in = &{{ model.name }}_gnsf_get_matrices_fun_sparsity_in;
     capsule->sim_gnsf_get_matrices_fun->casadi_sparsity_out = &{{ model.name }}_gnsf_get_matrices_fun_sparsity_out;
     capsule->sim_gnsf_get_matrices_fun->casadi_work = &{{ model.name }}_gnsf_get_matrices_fun_work;
-    external_function_param_casadi_create(capsule->sim_gnsf_get_matrices_fun, {{ dims.np }});
+    external_function_param_casadi_create(capsule->sim_gnsf_get_matrices_fun, np);
     {% endif %}
 
     // sim plan & config
@@ -243,6 +244,8 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
     capsule->acados_sim_opts = {{ model.name }}_sim_opts;
     int tmp_int = {{ solver_options.sim_method_newton_iter }};
     sim_opts_set({{ model.name }}_sim_config, {{ model.name }}_sim_opts, "newton_iter", &tmp_int);
+    sim_collocation_type collocation_type = {{ solver_options.collocation_type }};
+    sim_opts_set({{ model.name }}_sim_config, {{ model.name }}_sim_opts, "collocation_type", &collocation_type);
 
 {% if problem_class == "SIM" %}
     tmp_int = {{ solver_options.sim_method_num_stages }};
@@ -321,35 +324,18 @@ int {{ model.name }}_acados_sim_create(sim_solver_capsule * capsule)
                                                {{ model.name }}_sim_dims, {{ model.name }}_sim_opts);
     capsule->acados_sim_solver = {{ model.name }}_sim_solver;
 
+{% if dims.np > 0 %}
     /* initialize parameter values */
-    {% if dims.np > 0 %}
-    // initialize parameters to nominal value
-    double p[{{ dims.np }}];
-    {% for i in range(end=dims.np) %}
-    p[{{ i }}] = {{ parameter_values[i] }};
+    double* p = calloc(np, sizeof(double));
+    {% for item in parameter_values %}
+        {%- if item != 0 %}
+    p[{{ loop.index0 }}] = {{ item }};
+        {%- endif %}
     {%- endfor %}
 
-{%- if solver_options.integrator_type == "ERK" %}
-    capsule->sim_forw_vde_casadi[0].set_param(capsule->sim_forw_vde_casadi, p);
-    capsule->sim_expl_ode_fun_casadi[0].set_param(capsule->sim_expl_ode_fun_casadi, p);
-{%- if hessian_approx == "EXACT" %}
-    capsule->sim_expl_ode_hess[0].set_param(capsule->sim_expl_ode_hess, p);
-{%- endif %}
-{%- elif solver_options.integrator_type == "IRK" %}
-    capsule->sim_impl_dae_fun[0].set_param(capsule->sim_impl_dae_fun, p);
-    capsule->sim_impl_dae_fun_jac_x_xdot_z[0].set_param(capsule->sim_impl_dae_fun_jac_x_xdot_z, p);
-    capsule->sim_impl_dae_jac_x_xdot_u_z[0].set_param(capsule->sim_impl_dae_jac_x_xdot_u_z, p);
-{%- if hessian_approx == "EXACT" %}
-    capsule->sim_impl_dae_hess[0].set_param(capsule->sim_impl_dae_hess, p);
-{%- endif %}
-{%- elif solver_options.integrator_type == "GNSF" %}
-    capsule->sim_gnsf_phi_fun[0].set_param(capsule->sim_gnsf_phi_fun, p);
-    capsule->sim_gnsf_phi_fun_jac_y[0].set_param(capsule->sim_gnsf_phi_fun_jac_y, p);
-    capsule->sim_gnsf_phi_jac_y_uhat[0].set_param(capsule->sim_gnsf_phi_jac_y_uhat, p);
-    capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z[0].set_param(capsule->sim_gnsf_f_lo_jac_x1_x1dot_u_z, p);
-    capsule->sim_gnsf_get_matrices_fun[0].set_param(capsule->sim_gnsf_get_matrices_fun, p);
-{% endif %}
-    {% endif %}{# if dims.np #}
+    {{ model.name }}_acados_sim_update_params(capsule, p, np);
+    free(p);
+{% endif %}{# if dims.np #}
 
     /* initialize input */
     // x
@@ -437,7 +423,7 @@ int {{ model.name }}_acados_sim_free(sim_solver_capsule *capsule)
 int {{ model.name }}_acados_sim_update_params(sim_solver_capsule *capsule, double *p, int np)
 {
     int status = 0;
-    int casadi_np = {{ dims.np }};
+    int casadi_np = {{ model.name | upper }}_NP;
 
     if (casadi_np != np) {
         printf("{{ model.name }}_acados_sim_update_params: trying to set %i parameters for external functions."
diff --git a/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.h b/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.h
index de332e835..4da8202d3 100644
--- a/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.h
+++ b/pyextra/acados_template/c_templates_tera/acados_sim_solver.in.h
@@ -37,6 +37,11 @@
 #include "acados_c/sim_interface.h"
 #include "acados_c/external_function_interface.h"
 
+#define {{ model.name | upper }}_NX     {{ dims.nx }}
+#define {{ model.name | upper }}_NZ     {{ dims.nz }}
+#define {{ model.name | upper }}_NU     {{ dims.nu }}
+#define {{ model.name | upper }}_NP     {{ dims.np }}
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/pyextra/acados_template/c_templates_tera/acados_solver.in.c b/pyextra/acados_template/c_templates_tera/acados_solver.in.c
index da73f6e29..6e13242a4 100644
--- a/pyextra/acados_template/c_templates_tera/acados_solver.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_solver.in.c
@@ -71,59 +71,97 @@
 
 #include "acados_solver_{{ model.name }}.h"
 
-#define NX     {{ dims.nx }}
-#define NZ     {{ dims.nz }}
-#define NU     {{ dims.nu }}
-#define NP     {{ dims.np }}
-#define NBX    {{ dims.nbx }}
-#define NBX0   {{ dims.nbx_0 }}
-#define NBU    {{ dims.nbu }}
-#define NSBX   {{ dims.nsbx }}
-#define NSBU   {{ dims.nsbu }}
-#define NSH    {{ dims.nsh }}
-#define NSG    {{ dims.nsg }}
-#define NSPHI  {{ dims.nsphi }}
-#define NSHN   {{ dims.nsh_e }}
-#define NSGN   {{ dims.nsg_e }}
-#define NSPHIN {{ dims.nsphi_e }}
-#define NSBXN  {{ dims.nsbx_e }}
-#define NS     {{ dims.ns }}
-#define NSN    {{ dims.ns_e }}
-#define NG     {{ dims.ng }}
-#define NBXN   {{ dims.nbx_e }}
-#define NGN    {{ dims.ng_e }}
-#define NY0    {{ dims.ny_0 }}
-#define NY     {{ dims.ny }}
-#define NYN    {{ dims.ny_e }}
-#define N      {{ dims.N }}
-#define NH     {{ dims.nh }}
-#define NPHI   {{ dims.nphi }}
-#define NHN    {{ dims.nh_e }}
-#define NPHIN  {{ dims.nphi_e }}
-#define NR     {{ dims.nr }}
+#define NX     {{ model.name | upper }}_NX
+#define NZ     {{ model.name | upper }}_NZ
+#define NU     {{ model.name | upper }}_NU
+#define NP     {{ model.name | upper }}_NP
+#define NBX    {{ model.name | upper }}_NBX
+#define NBX0   {{ model.name | upper }}_NBX0
+#define NBU    {{ model.name | upper }}_NBU
+#define NSBX   {{ model.name | upper }}_NSBX
+#define NSBU   {{ model.name | upper }}_NSBU
+#define NSH    {{ model.name | upper }}_NSH
+#define NSG    {{ model.name | upper }}_NSG
+#define NSPHI  {{ model.name | upper }}_NSPHI
+#define NSHN   {{ model.name | upper }}_NSHN
+#define NSGN   {{ model.name | upper }}_NSGN
+#define NSPHIN {{ model.name | upper }}_NSPHIN
+#define NSBXN  {{ model.name | upper }}_NSBXN
+#define NS     {{ model.name | upper }}_NS
+#define NSN    {{ model.name | upper }}_NSN
+#define NG     {{ model.name | upper }}_NG
+#define NBXN   {{ model.name | upper }}_NBXN
+#define NGN    {{ model.name | upper }}_NGN
+#define NY0    {{ model.name | upper }}_NY0
+#define NY     {{ model.name | upper }}_NY
+#define NYN    {{ model.name | upper }}_NYN
+// #define N      {{ model.name | upper }}_N
+#define NH     {{ model.name | upper }}_NH
+#define NPHI   {{ model.name | upper }}_NPHI
+#define NHN    {{ model.name | upper }}_NHN
+#define NPHIN  {{ model.name | upper }}_NPHIN
+#define NR     {{ model.name | upper }}_NR
 
 
 // ** solver data **
 
-nlp_solver_capsule * {{ model.name }}_acados_create_capsule()
+{{ model.name }}_solver_capsule * {{ model.name }}_acados_create_capsule(void)
 {
-    void* capsule_mem = malloc(sizeof(nlp_solver_capsule));
-    nlp_solver_capsule *capsule = (nlp_solver_capsule *) capsule_mem;
+    void* capsule_mem = malloc(sizeof({{ model.name }}_solver_capsule));
+    {{ model.name }}_solver_capsule *capsule = ({{ model.name }}_solver_capsule *) capsule_mem;
 
     return capsule;
 }
 
 
-int {{ model.name }}_acados_free_capsule(nlp_solver_capsule *capsule)
+int {{ model.name }}_acados_free_capsule({{ model.name }}_solver_capsule *capsule)
 {
     free(capsule);
     return 0;
 }
 
 
-int {{ model.name }}_acados_create(nlp_solver_capsule * capsule)
+int {{ model.name }}_acados_create({{ model.name }}_solver_capsule * capsule)
+{
+    int N_shooting_intervals = {{ model.name | upper }}_N;
+    double* new_time_steps = NULL; // NULL -> don't alter the code generated time-steps
+    return {{ model.name }}_acados_create_with_discretization(capsule, N_shooting_intervals, new_time_steps);
+}
+
+int {{ model.name }}_acados_update_time_steps({{ model.name }}_solver_capsule * capsule, int N, double* new_time_steps)
+{
+    if (N != capsule->nlp_solver_plan->N) {
+        fprintf(stderr, "{{ model.name }}_acados_update_time_steps: given number of time steps (= %d) " \
+            "differs from the currently allocated number of " \
+            "time steps (= %d)!\n" \
+            "Please recreate with new discretization and provide a new vector of time_stamps!\n",
+            N, capsule->nlp_solver_plan->N);
+        return 1;
+    }
+
+    ocp_nlp_config * nlp_config = capsule->nlp_config;
+    ocp_nlp_dims * nlp_dims = capsule->nlp_dims;
+    ocp_nlp_in * nlp_in = capsule->nlp_in;
+
+    for (int i = 0; i < N; i++)
+    {
+        ocp_nlp_in_set(nlp_config, nlp_dims, nlp_in, i, "Ts", &new_time_steps[i]);
+        ocp_nlp_cost_model_set(nlp_config, nlp_dims, nlp_in, i, "scaling", &new_time_steps[i]);
+    }
+    return 0;
+}
+
+int {{ model.name }}_acados_create_with_discretization({{ model.name }}_solver_capsule * capsule, int N, double* new_time_steps)
 {
     int status = 0;
+    // If N does not match the number of shooting intervals used for code generation, new_time_steps must be given.
+    if (N != {{ model.name | upper }}_N && !new_time_steps) {
+        fprintf(stderr, "{{ model.name }}_acados_create_with_discretization: new_time_steps is NULL " \
+            "but the number of shooting intervals (= %d) differs from the number of " \
+            "shooting intervals (= %d) during code generation! Please provide a new vector of time_stamps!\n", \
+             N, {{ model.name | upper }}_N);
+        return 1;
+    }
 
     // number of expected runtime parameters
     capsule->nlp_np = NP;
@@ -895,28 +933,27 @@ int {{ model.name }}_acados_create(nlp_solver_capsule * capsule)
         {%- endif %}
     {%- endfor %}
 
+    if (new_time_steps) {
+        {{ model.name }}_acados_update_time_steps(capsule, N, new_time_steps);
+    } else {
     {%- if all_equal == true -%}
-    // all time_steps are identical
-    double time_step = {{ solver_options.time_steps[0] }};
-    for (int i = 0; i < N; i++)
-    {
-        ocp_nlp_in_set(nlp_config, nlp_dims, nlp_in, i, "Ts", &time_step);
-        ocp_nlp_cost_model_set(nlp_config, nlp_dims, nlp_in, i, "scaling", &time_step);
-    }
+        // all time_steps are identical
+        double time_step = {{ solver_options.time_steps[0] }};
+        for (int i = 0; i < N; i++)
+        {
+            ocp_nlp_in_set(nlp_config, nlp_dims, nlp_in, i, "Ts", &time_step);
+            ocp_nlp_cost_model_set(nlp_config, nlp_dims, nlp_in, i, "scaling", &time_step);
+        }
     {%- else -%}
-    // time_steps are different
-    double* time_steps = malloc(N*sizeof(double));
-    {%- for j in range(end=dims.N) %}
-    time_steps[{{ j }}] = {{ solver_options.time_steps[j] }};
-    {%- endfor %}
-
-    for (int i = 0; i < N; i++)
-    {
-        ocp_nlp_in_set(nlp_config, nlp_dims, nlp_in, i, "Ts", &time_steps[i]);
-        ocp_nlp_cost_model_set(nlp_config, nlp_dims, nlp_in, i, "scaling", &time_steps[i]);
-    }
-    free(time_steps);
+        // time_steps are different
+        double* time_steps = malloc(N*sizeof(double));
+        {%- for j in range(end=dims.N) %}
+        time_steps[{{ j }}] = {{ solver_options.time_steps[j] }};
+        {%- endfor %}
+        {{ model.name }}_acados_update_time_steps(capsule, N, time_steps);
+        free(time_steps);
     {%- endif %}
+    }
 
     /**** Dynamics ****/
     for (int i = 0; i < N; i++)
@@ -1879,6 +1916,11 @@ int {{ model.name }}_acados_create(nlp_solver_capsule * capsule)
 
 {%- if solver_options.integrator_type != "DISCRETE" %}
 
+    // set collocation type (relevant for implicit integrators)
+    sim_collocation_type collocation_type = {{ solver_options.collocation_type }};
+    for (int i = 0; i < N; i++)
+        ocp_nlp_solver_opts_set_at_stage(nlp_config, capsule->nlp_opts, i, "dynamics_collocation_type", &collocation_type);
+
     // set up sim_method_num_steps
     {%- set all_equal = true %}
     {%- set val = solver_options.sim_method_num_steps[0] %}
@@ -2113,7 +2155,7 @@ int {{ model.name }}_acados_create(nlp_solver_capsule * capsule)
 }
 
 
-int {{ model.name }}_acados_update_params(nlp_solver_capsule * capsule, int stage, double *p, int np)
+int {{ model.name }}_acados_update_params({{ model.name }}_solver_capsule * capsule, int stage, double *p, int np)
 {
     int solver_status = 0;
 
@@ -2125,7 +2167,8 @@ int {{ model.name }}_acados_update_params(nlp_solver_capsule * capsule, int stag
     }
 
 {%- if dims.np > 0 %}
-    if (stage < {{ dims.N }} && stage >= 0)
+    const int N = capsule->nlp_solver_plan->N;
+    if (stage < N && stage >= 0)
     {
     {%- if solver_options.integrator_type == "IRK" %}
         capsule->impl_dae_fun[stage].set_param(capsule->impl_dae_fun+stage, p);
@@ -2228,7 +2271,7 @@ int {{ model.name }}_acados_update_params(nlp_solver_capsule * capsule, int stag
 
 
 
-int {{ model.name }}_acados_solve(nlp_solver_capsule * capsule)
+int {{ model.name }}_acados_solve({{ model.name }}_solver_capsule * capsule)
 {
     // solve NLP 
     int solver_status = ocp_nlp_solve(capsule->nlp_solver, capsule->nlp_in, capsule->nlp_out);
@@ -2237,8 +2280,10 @@ int {{ model.name }}_acados_solve(nlp_solver_capsule * capsule)
 }
 
 
-int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
+int {{ model.name }}_acados_free({{ model.name }}_solver_capsule * capsule)
 {
+    // before destroying, keep some info
+    const int N = capsule->nlp_solver_plan->N;
     // free memory
     ocp_nlp_solver_opts_destroy(capsule->nlp_opts);
     ocp_nlp_in_destroy(capsule->nlp_in);
@@ -2251,7 +2296,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     /* free external function */
     // dynamics
 {%- if solver_options.integrator_type == "IRK" %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->impl_dae_fun[i]);
         external_function_param_casadi_free(&capsule->impl_dae_fun_jac_x_xdot_z[i]);
@@ -2268,7 +2313,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     {%- endif %}
 
 {%- elif solver_options.integrator_type == "LIFTED_IRK" %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->impl_dae_fun[i]);
         external_function_param_casadi_free(&capsule->impl_dae_fun_jac_x_xdot_u[i]);
@@ -2277,7 +2322,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     free(capsule->impl_dae_fun_jac_x_xdot_u);
 
 {%- elif solver_options.integrator_type == "ERK" %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->forw_vde_casadi[i]);
         external_function_param_casadi_free(&capsule->expl_ode_fun[i]);
@@ -2292,7 +2337,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     {%- endif %}
 
 {%- elif solver_options.integrator_type == "GNSF" %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->gnsf_phi_fun[i]);
         external_function_param_casadi_free(&capsule->gnsf_phi_fun_jac_y[i]);
@@ -2306,7 +2351,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     free(capsule->gnsf_f_lo_jac_x1_x1dot_u_z);
     free(capsule->gnsf_get_matrices_fun);
 {%- elif solver_options.integrator_type == "DISCRETE" %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_{{ model.dyn_ext_fun_type }}_free(&capsule->discr_dyn_phi_fun[i]);
         external_function_param_{{ model.dyn_ext_fun_type }}_free(&capsule->discr_dyn_phi_fun_jac_ut_xt[i]);
@@ -2333,7 +2378,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     external_function_param_{{ cost.cost_ext_fun_type_0 }}_free(&capsule->ext_cost_0_fun_jac_hess);
 {%- endif %}
 {%- if cost.cost_type == "NONLINEAR_LS" %}
-    for (int i = 0; i < {{ dims.N }} - 1; i++)
+    for (int i = 0; i < N - 1; i++)
     {
         external_function_param_casadi_free(&capsule->cost_y_fun[i]);
         external_function_param_casadi_free(&capsule->cost_y_fun_jac_ut_xt[i]);
@@ -2343,7 +2388,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     free(capsule->cost_y_fun_jac_ut_xt);
     free(capsule->cost_y_hess);
 {%- elif cost.cost_type == "EXTERNAL" %}
-    for (int i = 0; i < {{ dims.N }} - 1; i++)
+    for (int i = 0; i < N - 1; i++)
     {
         external_function_param_{{ cost.cost_ext_fun_type }}_free(&capsule->ext_cost_fun[i]);
         external_function_param_{{ cost.cost_ext_fun_type }}_free(&capsule->ext_cost_fun_jac[i]);
@@ -2365,13 +2410,13 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
 
     // constraints
 {%- if constraints.constr_type == "BGH" and dims.nh > 0 %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->nl_constr_h_fun_jac[i]);
         external_function_param_casadi_free(&capsule->nl_constr_h_fun[i]);
     }
   {%- if solver_options.hessian_approx == "EXACT" %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->nl_constr_h_fun_jac_hess[i]);
     }
@@ -2383,7 +2428,7 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
   {%- endif %}
 
 {%- elif constraints.constr_type == "BGP" and dims.nphi > 0 %}
-    for (int i = 0; i < {{ dims.N }}; i++)
+    for (int i = 0; i < N; i++)
     {
         external_function_param_casadi_free(&capsule->phi_constraint[i]);
     }
@@ -2403,16 +2448,16 @@ int {{ model.name }}_acados_free(nlp_solver_capsule * capsule)
     return 0;
 }
 
-ocp_nlp_in *{{ model.name }}_acados_get_nlp_in(nlp_solver_capsule * capsule) { return capsule->nlp_in; }
-ocp_nlp_out *{{ model.name }}_acados_get_nlp_out(nlp_solver_capsule * capsule) { return capsule->nlp_out; }
-ocp_nlp_solver *{{ model.name }}_acados_get_nlp_solver(nlp_solver_capsule * capsule) { return capsule->nlp_solver; }
-ocp_nlp_config *{{ model.name }}_acados_get_nlp_config(nlp_solver_capsule * capsule) { return capsule->nlp_config; }
-void *{{ model.name }}_acados_get_nlp_opts(nlp_solver_capsule * capsule) { return capsule->nlp_opts; }
-ocp_nlp_dims *{{ model.name }}_acados_get_nlp_dims(nlp_solver_capsule * capsule) { return capsule->nlp_dims; }
-ocp_nlp_plan *{{ model.name }}_acados_get_nlp_plan(nlp_solver_capsule * capsule) { return capsule->nlp_solver_plan; }
+ocp_nlp_in *{{ model.name }}_acados_get_nlp_in({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_in; }
+ocp_nlp_out *{{ model.name }}_acados_get_nlp_out({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_out; }
+ocp_nlp_solver *{{ model.name }}_acados_get_nlp_solver({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_solver; }
+ocp_nlp_config *{{ model.name }}_acados_get_nlp_config({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_config; }
+void *{{ model.name }}_acados_get_nlp_opts({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_opts; }
+ocp_nlp_dims *{{ model.name }}_acados_get_nlp_dims({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_dims; }
+ocp_nlp_plan *{{ model.name }}_acados_get_nlp_plan({{ model.name }}_solver_capsule * capsule) { return capsule->nlp_solver_plan; }
 
 
-void {{ model.name }}_acados_print_stats(nlp_solver_capsule * capsule)
+void {{ model.name }}_acados_print_stats({{ model.name }}_solver_capsule * capsule)
 {
     int sqp_iter, stat_m, stat_n, tmp_int;
     ocp_nlp_get(capsule->nlp_config, capsule->nlp_solver, "sqp_iter", &sqp_iter);
diff --git a/pyextra/acados_template/c_templates_tera/acados_solver.in.h b/pyextra/acados_template/c_templates_tera/acados_solver.in.h
index e8966c254..e8c0a38ca 100644
--- a/pyextra/acados_template/c_templates_tera/acados_solver.in.h
+++ b/pyextra/acados_template/c_templates_tera/acados_solver.in.h
@@ -37,12 +37,43 @@
 #include "acados_c/ocp_nlp_interface.h"
 #include "acados_c/external_function_interface.h"
 
+#define {{ model.name | upper }}_NX     {{ dims.nx }}
+#define {{ model.name | upper }}_NZ     {{ dims.nz }}
+#define {{ model.name | upper }}_NU     {{ dims.nu }}
+#define {{ model.name | upper }}_NP     {{ dims.np }}
+#define {{ model.name | upper }}_NBX    {{ dims.nbx }}
+#define {{ model.name | upper }}_NBX0   {{ dims.nbx_0 }}
+#define {{ model.name | upper }}_NBU    {{ dims.nbu }}
+#define {{ model.name | upper }}_NSBX   {{ dims.nsbx }}
+#define {{ model.name | upper }}_NSBU   {{ dims.nsbu }}
+#define {{ model.name | upper }}_NSH    {{ dims.nsh }}
+#define {{ model.name | upper }}_NSG    {{ dims.nsg }}
+#define {{ model.name | upper }}_NSPHI  {{ dims.nsphi }}
+#define {{ model.name | upper }}_NSHN   {{ dims.nsh_e }}
+#define {{ model.name | upper }}_NSGN   {{ dims.nsg_e }}
+#define {{ model.name | upper }}_NSPHIN {{ dims.nsphi_e }}
+#define {{ model.name | upper }}_NSBXN  {{ dims.nsbx_e }}
+#define {{ model.name | upper }}_NS     {{ dims.ns }}
+#define {{ model.name | upper }}_NSN    {{ dims.ns_e }}
+#define {{ model.name | upper }}_NG     {{ dims.ng }}
+#define {{ model.name | upper }}_NBXN   {{ dims.nbx_e }}
+#define {{ model.name | upper }}_NGN    {{ dims.ng_e }}
+#define {{ model.name | upper }}_NY0    {{ dims.ny_0 }}
+#define {{ model.name | upper }}_NY     {{ dims.ny }}
+#define {{ model.name | upper }}_NYN    {{ dims.ny_e }}
+#define {{ model.name | upper }}_N      {{ dims.N }}
+#define {{ model.name | upper }}_NH     {{ dims.nh }}
+#define {{ model.name | upper }}_NPHI   {{ dims.nphi }}
+#define {{ model.name | upper }}_NHN    {{ dims.nh_e }}
+#define {{ model.name | upper }}_NPHIN  {{ dims.nphi_e }}
+#define {{ model.name | upper }}_NR     {{ dims.nr }}
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 // ** capsule for solver data **
-typedef struct nlp_solver_capsule
+typedef struct {{ model.name }}_solver_capsule
 {
     // acados objects
     ocp_nlp_in *nlp_in;
@@ -58,73 +89,115 @@ typedef struct nlp_solver_capsule
 
     /* external functions */
     // dynamics
+{% if solver_options.integrator_type == "ERK" %}
     external_function_param_casadi *forw_vde_casadi;
     external_function_param_casadi *expl_ode_fun;
+{% if solver_options.hessian_approx == "EXACT" %}
     external_function_param_casadi *hess_vde_casadi;
+{%- endif %}
+{% elif solver_options.integrator_type == "IRK" %}
     external_function_param_casadi *impl_dae_fun;
     external_function_param_casadi *impl_dae_fun_jac_x_xdot_z;
     external_function_param_casadi *impl_dae_jac_x_xdot_u_z;
-    external_function_param_casadi *impl_dae_fun_jac_x_xdot_u;
+{% if solver_options.hessian_approx == "EXACT" %}
     external_function_param_casadi *impl_dae_hess;
+{%- endif %}
+{% elif solver_options.integrator_type == "LIFTED_IRK" %}
+    external_function_param_casadi *impl_dae_fun;
+    external_function_param_casadi *impl_dae_fun_jac_x_xdot_u;
+{% elif solver_options.integrator_type == "GNSF" %}
     external_function_param_casadi *gnsf_phi_fun;
     external_function_param_casadi *gnsf_phi_fun_jac_y;
     external_function_param_casadi *gnsf_phi_jac_y_uhat;
     external_function_param_casadi *gnsf_f_lo_jac_x1_x1dot_u_z;
     external_function_param_casadi *gnsf_get_matrices_fun;
+{% elif solver_options.integrator_type == "DISCRETE" %}
     external_function_param_{{ model.dyn_ext_fun_type }} *discr_dyn_phi_fun;
     external_function_param_{{ model.dyn_ext_fun_type }} *discr_dyn_phi_fun_jac_ut_xt;
+{%- if solver_options.hessian_approx == "EXACT" %}
     external_function_param_{{ model.dyn_ext_fun_type }} *discr_dyn_phi_fun_jac_ut_xt_hess;
+{%- endif %}
+{%- endif %}
+
 
     // cost
+{% if cost.cost_type == "NONLINEAR_LS" %}
     external_function_param_casadi *cost_y_fun;
     external_function_param_casadi *cost_y_fun_jac_ut_xt;
     external_function_param_casadi *cost_y_hess;
+{%- elif cost.cost_type == "EXTERNAL" %}
     external_function_param_{{ cost.cost_ext_fun_type }} *ext_cost_fun;
     external_function_param_{{ cost.cost_ext_fun_type }} *ext_cost_fun_jac;
     external_function_param_{{ cost.cost_ext_fun_type }} *ext_cost_fun_jac_hess;
+{% endif %}
 
+{% if cost.cost_type_0 == "NONLINEAR_LS" %}
     external_function_param_casadi cost_y_0_fun;
     external_function_param_casadi cost_y_0_fun_jac_ut_xt;
     external_function_param_casadi cost_y_0_hess;
+{% elif cost.cost_type_0 == "EXTERNAL" %}
     external_function_param_{{ cost.cost_ext_fun_type_0 }} ext_cost_0_fun;
     external_function_param_{{ cost.cost_ext_fun_type_0 }} ext_cost_0_fun_jac;
     external_function_param_{{ cost.cost_ext_fun_type_0 }} ext_cost_0_fun_jac_hess;
+{%- endif %}
 
+{% if cost.cost_type_e == "NONLINEAR_LS" %}
     external_function_param_casadi cost_y_e_fun;
     external_function_param_casadi cost_y_e_fun_jac_ut_xt;
     external_function_param_casadi cost_y_e_hess;
+{% elif cost.cost_type_e == "EXTERNAL" %}
     external_function_param_{{ cost.cost_ext_fun_type_e }} ext_cost_e_fun;
     external_function_param_{{ cost.cost_ext_fun_type_e }} ext_cost_e_fun_jac;
     external_function_param_{{ cost.cost_ext_fun_type_e }} ext_cost_e_fun_jac_hess;
+{%- endif %}
 
     // constraints
+{%- if constraints.constr_type == "BGP" %}
     external_function_param_casadi *phi_constraint;
+{% elif constraints.constr_type == "BGH" and dims.nh > 0 %}
     external_function_param_casadi *nl_constr_h_fun_jac;
     external_function_param_casadi *nl_constr_h_fun;
     external_function_param_casadi *nl_constr_h_fun_jac_hess;
+{%- endif %}
 
+
+{% if constraints.constr_type_e == "BGP" %}
     external_function_param_casadi phi_e_constraint;
+{% elif constraints.constr_type_e == "BGH" and dims.nh_e > 0 %}
     external_function_param_casadi nl_constr_h_e_fun_jac;
     external_function_param_casadi nl_constr_h_e_fun;
     external_function_param_casadi nl_constr_h_e_fun_jac_hess;
-} nlp_solver_capsule;
+{%- endif %}
 
-nlp_solver_capsule * {{ model.name }}_acados_create_capsule(void);
-int {{ model.name }}_acados_free_capsule(nlp_solver_capsule *capsule);
+} {{ model.name }}_solver_capsule;
 
-int {{ model.name }}_acados_create(nlp_solver_capsule * capsule);
-int {{ model.name }}_acados_update_params(nlp_solver_capsule * capsule, int stage, double *value, int np);
-int {{ model.name }}_acados_solve(nlp_solver_capsule * capsule);
-int {{ model.name }}_acados_free(nlp_solver_capsule * capsule);
-void {{ model.name }}_acados_print_stats(nlp_solver_capsule * capsule);
+{{ model.name }}_solver_capsule * {{ model.name }}_acados_create_capsule(void);
+int {{ model.name }}_acados_free_capsule({{ model.name }}_solver_capsule *capsule);
 
-ocp_nlp_in *{{ model.name }}_acados_get_nlp_in(nlp_solver_capsule * capsule);
-ocp_nlp_out *{{ model.name }}_acados_get_nlp_out(nlp_solver_capsule * capsule);
-ocp_nlp_solver *{{ model.name }}_acados_get_nlp_solver(nlp_solver_capsule * capsule);
-ocp_nlp_config *{{ model.name }}_acados_get_nlp_config(nlp_solver_capsule * capsule);
-void *{{ model.name }}_acados_get_nlp_opts(nlp_solver_capsule * capsule);
-ocp_nlp_dims *{{ model.name }}_acados_get_nlp_dims(nlp_solver_capsule * capsule);
-ocp_nlp_plan *{{ model.name }}_acados_get_nlp_plan(nlp_solver_capsule * capsule);
+int {{ model.name }}_acados_create({{ model.name }}_solver_capsule * capsule);
+/**
+ * Generic version of {{ model.name }}_acados_create which allows to use a different number of shooting intervals than
+ * the number used for code generation. If new_time_steps=NULL and n_time_steps matches the number used for code
+ * generation, the time-steps from code generation is used.
+ */
+int {{ model.name }}_acados_create_with_discretization({{ model.name }}_solver_capsule * capsule, int n_time_steps, double* new_time_steps);
+/**
+ * Update the time step vector. Number N must be identical to the currently set number of shooting nodes in the
+ * nlp_solver_plan. Returns 0 if no error occurred and a otherwise a value other than 0.
+ */
+int {{ model.name }}_acados_update_time_steps({{ model.name }}_solver_capsule * capsule, int N, double* new_time_steps);
+int {{ model.name }}_acados_update_params({{ model.name }}_solver_capsule * capsule, int stage, double *value, int np);
+int {{ model.name }}_acados_solve({{ model.name }}_solver_capsule * capsule);
+int {{ model.name }}_acados_free({{ model.name }}_solver_capsule * capsule);
+void {{ model.name }}_acados_print_stats({{ model.name }}_solver_capsule * capsule);
+
+ocp_nlp_in *{{ model.name }}_acados_get_nlp_in({{ model.name }}_solver_capsule * capsule);
+ocp_nlp_out *{{ model.name }}_acados_get_nlp_out({{ model.name }}_solver_capsule * capsule);
+ocp_nlp_solver *{{ model.name }}_acados_get_nlp_solver({{ model.name }}_solver_capsule * capsule);
+ocp_nlp_config *{{ model.name }}_acados_get_nlp_config({{ model.name }}_solver_capsule * capsule);
+void *{{ model.name }}_acados_get_nlp_opts({{ model.name }}_solver_capsule * capsule);
+ocp_nlp_dims *{{ model.name }}_acados_get_nlp_dims({{ model.name }}_solver_capsule * capsule);
+ocp_nlp_plan *{{ model.name }}_acados_get_nlp_plan({{ model.name }}_solver_capsule * capsule);
 
 #ifdef __cplusplus
 } /* extern "C" */
diff --git a/pyextra/acados_template/c_templates_tera/acados_solver.in.pxd b/pyextra/acados_template/c_templates_tera/acados_solver.in.pxd
new file mode 100644
index 000000000..8387980c2
--- /dev/null
+++ b/pyextra/acados_template/c_templates_tera/acados_solver.in.pxd
@@ -0,0 +1,22 @@
+cimport acados_solver_common
+
+cdef extern from "acados_solver_{{ model.name }}.h":
+    ctypedef struct nlp_solver_capsule "{{ model.name }}_solver_capsule":
+        pass
+
+    nlp_solver_capsule * acados_create_capsule "{{ model.name }}_acados_create_capsule"()
+    int acados_free_capsule "{{ model.name }}_acados_free_capsule"(nlp_solver_capsule *capsule)
+
+    int acados_create "{{ model.name }}_acados_create"(nlp_solver_capsule * capsule)
+    int acados_update_params "{{ model.name }}_acados_update_params"(nlp_solver_capsule * capsule, int stage, double *value, int np_)
+    int acados_solve "{{ model.name }}_acados_solve"(nlp_solver_capsule * capsule)
+    int acados_free "{{ model.name }}_acados_free"(nlp_solver_capsule * capsule)
+    void acados_print_stats "{{ model.name }}_acados_print_stats"(nlp_solver_capsule * capsule)
+
+    acados_solver_common.ocp_nlp_in *acados_get_nlp_in "{{ model.name }}_acados_get_nlp_in"(nlp_solver_capsule * capsule)
+    acados_solver_common.ocp_nlp_out *acados_get_nlp_out "{{ model.name }}_acados_get_nlp_out"(nlp_solver_capsule * capsule)
+    acados_solver_common.ocp_nlp_solver *acados_get_nlp_solver "{{ model.name }}_acados_get_nlp_solver"(nlp_solver_capsule * capsule)
+    acados_solver_common.ocp_nlp_config *acados_get_nlp_config "{{ model.name }}_acados_get_nlp_config"(nlp_solver_capsule * capsule)
+    void *acados_get_nlp_opts "{{ model.name }}_acados_get_nlp_opts"(nlp_solver_capsule * capsule)
+    acados_solver_common.ocp_nlp_dims *acados_get_nlp_dims "{{ model.name }}_acados_get_nlp_dims"(nlp_solver_capsule * capsule)
+    acados_solver_common.ocp_nlp_plan *acados_get_nlp_plan "{{ model.name }}_acados_get_nlp_plan"(nlp_solver_capsule * capsule)
diff --git a/pyextra/acados_template/c_templates_tera/acados_solver_sfun.in.c b/pyextra/acados_template/c_templates_tera/acados_solver_sfun.in.c
index e70c5cafe..a6cd1faa8 100644
--- a/pyextra/acados_template/c_templates_tera/acados_solver_sfun.in.c
+++ b/pyextra/acados_template/c_templates_tera/acados_solver_sfun.in.c
@@ -377,7 +377,7 @@ static void mdlInitializeSampleTimes(SimStruct *S)
 
 static void mdlStart(SimStruct *S)
 {
-    nlp_solver_capsule *capsule = {{ model.name }}_acados_create_capsule();
+    {{ model.name }}_solver_capsule *capsule = {{ model.name }}_acados_create_capsule();
     {{ model.name }}_acados_create(capsule);
 
     ssSetUserData(S, (void*)capsule);
@@ -386,7 +386,7 @@ static void mdlStart(SimStruct *S)
 
 static void mdlOutputs(SimStruct *S, int_T tid)
 {
-    nlp_solver_capsule *capsule = ssGetUserData(S);
+    {{ model.name }}_solver_capsule *capsule = ssGetUserData(S);
     ocp_nlp_config *nlp_config = {{ model.name }}_acados_get_nlp_config(capsule);
     ocp_nlp_dims *nlp_dims = {{ model.name }}_acados_get_nlp_dims(capsule);
     ocp_nlp_in *nlp_in = {{ model.name }}_acados_get_nlp_in(capsule);
@@ -747,7 +747,7 @@ static void mdlOutputs(SimStruct *S, int_T tid)
 
 static void mdlTerminate(SimStruct *S)
 {
-    nlp_solver_capsule *capsule = ssGetUserData(S);
+    {{ model.name }}_solver_capsule *capsule = ssGetUserData(S);
 
     {{ model.name }}_acados_free(capsule);
     {{ model.name }}_acados_free_capsule(capsule);
diff --git a/pyextra/acados_template/c_templates_tera/main.in.c b/pyextra/acados_template/c_templates_tera/main.in.c
index 1cae69aea..3348eea5c 100644
--- a/pyextra/acados_template/c_templates_tera/main.in.c
+++ b/pyextra/acados_template/c_templates_tera/main.in.c
@@ -42,12 +42,46 @@
 #include "acados_c/external_function_interface.h"
 #include "acados_solver_{{ model.name }}.h"
 
+#define NX     {{ model.name | upper }}_NX
+#define NZ     {{ model.name | upper }}_NZ
+#define NU     {{ model.name | upper }}_NU
+#define NP     {{ model.name | upper }}_NP
+#define NBX    {{ model.name | upper }}_NBX
+#define NBX0   {{ model.name | upper }}_NBX0
+#define NBU    {{ model.name | upper }}_NBU
+#define NSBX   {{ model.name | upper }}_NSBX
+#define NSBU   {{ model.name | upper }}_NSBU
+#define NSH    {{ model.name | upper }}_NSH
+#define NSG    {{ model.name | upper }}_NSG
+#define NSPHI  {{ model.name | upper }}_NSPHI
+#define NSHN   {{ model.name | upper }}_NSHN
+#define NSGN   {{ model.name | upper }}_NSGN
+#define NSPHIN {{ model.name | upper }}_NSPHIN
+#define NSBXN  {{ model.name | upper }}_NSBXN
+#define NS     {{ model.name | upper }}_NS
+#define NSN    {{ model.name | upper }}_NSN
+#define NG     {{ model.name | upper }}_NG
+#define NBXN   {{ model.name | upper }}_NBXN
+#define NGN    {{ model.name | upper }}_NGN
+#define NY0    {{ model.name | upper }}_NY0
+#define NY     {{ model.name | upper }}_NY
+#define NYN    {{ model.name | upper }}_NYN
+#define NH     {{ model.name | upper }}_NH
+#define NPHI   {{ model.name | upper }}_NPHI
+#define NHN    {{ model.name | upper }}_NHN
+#define NPHIN  {{ model.name | upper }}_NPHIN
+#define NR     {{ model.name | upper }}_NR
+
 
 int main()
 {
 
-    nlp_solver_capsule *acados_ocp_capsule = {{ model.name }}_acados_create_capsule();
-    int status = {{ model.name }}_acados_create(acados_ocp_capsule);
+    {{ model.name }}_solver_capsule *acados_ocp_capsule = {{ model.name }}_acados_create_capsule();
+    // there is an opportunity to change the number of shooting intervals in C without new code generation
+    int N = {{ model.name | upper }}_N;
+    // allocate the array and fill it accordingly
+    double* new_time_steps = NULL;
+    int status = {{ model.name }}_acados_create_with_discretization(acados_ocp_capsule, N, new_time_steps);
 
     if (status)
     {
@@ -63,13 +97,13 @@ int main()
     void *nlp_opts = {{ model.name }}_acados_get_nlp_opts(acados_ocp_capsule);
 
     // initial condition
-    int idxbx0[{{ dims.nbx_0 }}];
+    int idxbx0[NBX0];
     {%- for i in range(end=dims.nbx_0) %}
     idxbx0[{{ i }}] = {{ constraints.idxbx_0[i] }};
     {%- endfor %}
 
-    double lbx0[{{ dims.nbx_0 }}];
-    double ubx0[{{ dims.nbx_0 }}];
+    double lbx0[NBX0];
+    double ubx0[NBX0];
     {%- for i in range(end=dims.nbx_0) %}
     lbx0[{{ i }}] = {{ constraints.lbx_0[i] }};
     ubx0[{{ i }}] = {{ constraints.ubx_0[i] }};
@@ -80,13 +114,13 @@ int main()
     ocp_nlp_constraints_model_set(nlp_config, nlp_dims, nlp_in, 0, "ubx", ubx0);
 
     // initialization for state values
-    double x_init[{{ dims.nx }}];
+    double x_init[NX];
     {%- for i in range(end=dims.nx) %}
     x_init[{{ i }}] = 0.0;
     {%- endfor %}
 
     // initial value for control input
-    double u0[{{ dims.nu }}];
+    double u0[NU];
     {%- for i in range(end=dims.nu) %}
     u0[{{ i }}] = 0.0;
     {%- endfor %}
@@ -94,14 +128,14 @@ int main()
 
   {%- if dims.np > 0 %}
     // set parameters
-    double p[{{ dims.np }}];
-    {% for item in parameter_values %}
+    double p[NP];
+    {%- for item in parameter_values %}
     p[{{ loop.index0 }}] = {{ item }};
-    {% endfor %}
+    {%- endfor %}
 
-    for (int ii = 0; ii <= {{ dims.N }}; ii++)
+    for (int ii = 0; ii <= N; ii++)
     {
-        {{ model.name }}_acados_update_params(acados_ocp_capsule, ii, p, {{ dims.np }});
+        {{ model.name }}_acados_update_params(acados_ocp_capsule, ii, p, NP);
     }
   {% endif %}{# if np > 0 #}
 
@@ -112,8 +146,8 @@ int main()
     double elapsed_time;
     int sqp_iter;
 
-    double xtraj[{{ dims.nx }} * ({{ dims.N }}+1)];
-    double utraj[{{ dims.nu }} * ({{ dims.N }})];
+    double xtraj[NX * (N+1)];
+    double utraj[NU * N];
 
 
     // solve ocp in loop
@@ -135,14 +169,14 @@ int main()
 
     /* print solution and statistics */
     for (int ii = 0; ii <= nlp_dims->N; ii++)
-        ocp_nlp_out_get(nlp_config, nlp_dims, nlp_out, ii, "x", &xtraj[ii*{{ dims.nx }}]);
+        ocp_nlp_out_get(nlp_config, nlp_dims, nlp_out, ii, "x", &xtraj[ii*NX]);
     for (int ii = 0; ii < nlp_dims->N; ii++)
-        ocp_nlp_out_get(nlp_config, nlp_dims, nlp_out, ii, "u", &utraj[ii*{{ dims.nu }}]);
+        ocp_nlp_out_get(nlp_config, nlp_dims, nlp_out, ii, "u", &utraj[ii*NU]);
 
     printf("\n--- xtraj ---\n");
-    d_print_exp_tran_mat( {{ dims.nx }}, {{ dims.N }}+1, xtraj, {{ dims.nx }} );
+    d_print_exp_tran_mat( NX, N+1, xtraj, NX);
     printf("\n--- utraj ---\n");
-    d_print_exp_tran_mat( {{ dims.nu }}, {{ dims.N }}, utraj, {{ dims.nu }} );
+    d_print_exp_tran_mat( NU, N, utraj, NU );
     // ocp_nlp_out_print(nlp_solver->dims, nlp_out);
 
     printf("\nsolved ocp %d times, solution printed above\n\n", NTIMINGS);
diff --git a/pyextra/acados_template/c_templates_tera/main_sim.in.c b/pyextra/acados_template/c_templates_tera/main_sim.in.c
index b62bf8f7f..743e81d59 100644
--- a/pyextra/acados_template/c_templates_tera/main_sim.in.c
+++ b/pyextra/acados_template/c_templates_tera/main_sim.in.c
@@ -41,6 +41,11 @@
 #include "acados_c/sim_interface.h"
 #include "acados_sim_solver_{{ model.name }}.h"
 
+#define NX     {{ model.name | upper }}_NX
+#define NZ     {{ model.name | upper }}_NZ
+#define NU     {{ model.name | upper }}_NU
+#define NP     {{ model.name | upper }}_NP
+
 
 int main()
 {
@@ -60,7 +65,7 @@ int main()
     void *acados_sim_dims = {{ model.name }}_acados_get_sim_dims(capsule);
 
     // initial condition
-    double x_current[{{ dims.nx }}];
+    double x_current[NX];
     {%- for i in range(end=dims.nx) %}
     x_current[{{ i }}] = 0.0;
     {%- endfor %}
@@ -78,19 +83,19 @@ int main()
 
 
     // initial value for control input
-    double u0[{{ dims.nu }}];
+    double u0[NU];
     {%- for i in range(end=dims.nu) %}
     u0[{{ i }}] = 0.0;
     {%- endfor %}
 
   {%- if dims.np > 0 %}
     // set parameters
-    double p[{{ dims.np }}];
-    {% for item in parameter_values %}
+    double p[NP];
+    {%- for item in parameter_values %}
     p[{{ loop.index0 }}] = {{ item }};
-    {% endfor %}
+    {%- endfor %}
 
-    {{ model.name }}_acados_sim_update_params(capsule, p, {{ dims.np }});
+    {{ model.name }}_acados_sim_update_params(capsule, p, NP);
   {% endif %}{# if np > 0 #}
 
     int n_sim_steps = 3;
@@ -110,7 +115,7 @@ int main()
                acados_sim_out, "x", x_current);
         
         printf("\nx_current, %d\n", ii);
-        for (int jj = 0; jj < {{ dims.nx }}; jj++)
+        for (int jj = 0; jj < NX; jj++)
         {
             printf("%e\n", x_current[jj]);
         }
diff --git a/pyextra/acados_template/generate_c_code_constraint.py b/pyextra/acados_template/generate_c_code_constraint.py
index 93e919c56..c79ddc129 100644
--- a/pyextra/acados_template/generate_c_code_constraint.py
+++ b/pyextra/acados_template/generate_c_code_constraint.py
@@ -94,7 +94,7 @@ def generate_c_code_constraint( model, con_name, is_terminal, opts ):
         gen_dir = con_name + '_constraints'
         if not os.path.exists(gen_dir):
             os.mkdir(gen_dir)
-        gen_dir_location = './' + gen_dir
+        gen_dir_location = os.path.join('.', gen_dir)
         os.chdir(gen_dir_location)
 
         # export casadi functions
diff --git a/pyextra/acados_template/generate_c_code_discrete_dynamics.py b/pyextra/acados_template/generate_c_code_discrete_dynamics.py
index 531b5ed9d..334e18dab 100644
--- a/pyextra/acados_template/generate_c_code_discrete_dynamics.py
+++ b/pyextra/acados_template/generate_c_code_discrete_dynamics.py
@@ -80,7 +80,7 @@ def generate_c_code_discrete_dynamics( model, opts ):
     model_dir = model_name + '_model'
     if not os.path.exists(model_dir):
         os.mkdir(model_dir)
-    model_dir_location = './' + model_dir
+    model_dir_location = os.path.join('.', model_dir)
     os.chdir(model_dir_location)
 
     # set up & generate Functions
diff --git a/pyextra/acados_template/generate_c_code_explicit_ode.py b/pyextra/acados_template/generate_c_code_explicit_ode.py
index fb0c020b0..76e640053 100644
--- a/pyextra/acados_template/generate_c_code_explicit_ode.py
+++ b/pyextra/acados_template/generate_c_code_explicit_ode.py
@@ -105,7 +105,7 @@ def generate_c_code_explicit_ode( model, opts ):
     model_dir = model_name + '_model'
     if not os.path.exists(model_dir):
         os.mkdir(model_dir)
-    model_dir_location = './' + model_dir
+    model_dir_location = os.path.join('.', model_dir)
     os.chdir(model_dir_location)
     fun_name = model_name + '_expl_ode_fun'
     expl_ode_fun.generate(fun_name, casadi_opts)
diff --git a/pyextra/acados_template/generate_c_code_external_cost.py b/pyextra/acados_template/generate_c_code_external_cost.py
index bc21f85f7..839652261 100644
--- a/pyextra/acados_template/generate_c_code_external_cost.py
+++ b/pyextra/acados_template/generate_c_code_external_cost.py
@@ -58,6 +58,7 @@ def generate_c_code_external_cost(model, stage_type, opts):
         suffix_name_jac = "_cost_ext_cost_e_fun_jac"
         u = symbol("u", 0, 0)
         ext_cost = model.cost_expr_ext_cost_e
+        custom_hess = model.cost_expr_ext_cost_custom_hess_e
 
     elif stage_type == 'path':
         suffix_name = "_cost_ext_cost_fun"
@@ -65,6 +66,7 @@ def generate_c_code_external_cost(model, stage_type, opts):
         suffix_name_jac = "_cost_ext_cost_fun_jac"
         u = model.u
         ext_cost = model.cost_expr_ext_cost
+        custom_hess = model.cost_expr_ext_cost_custom_hess
 
     elif stage_type == 'initial':
         suffix_name = "_cost_ext_cost_0_fun"
@@ -72,6 +74,7 @@ def generate_c_code_external_cost(model, stage_type, opts):
         suffix_name_jac = "_cost_ext_cost_0_fun_jac"
         u = model.u
         ext_cost = model.cost_expr_ext_cost_0
+        custom_hess = model.cost_expr_ext_cost_custom_hess_0
 
     # set up functions to be exported
     fun_name = model.name + suffix_name
@@ -81,6 +84,9 @@ def generate_c_code_external_cost(model, stage_type, opts):
     # generate expression for full gradient and Hessian
     full_hess, grad = hessian(ext_cost, vertcat(u, x))
 
+    if custom_hess is not None:
+        full_hess = custom_hess
+
     ext_cost_fun = Function(fun_name, [x, u, p], [ext_cost])
     ext_cost_fun_jac_hess = Function(
         fun_name_hess, [x, u, p], [ext_cost, grad, full_hess]
diff --git a/pyextra/acados_template/generate_c_code_gnsf.py b/pyextra/acados_template/generate_c_code_gnsf.py
index 3203cbbcc..97acb8e33 100644
--- a/pyextra/acados_template/generate_c_code_gnsf.py
+++ b/pyextra/acados_template/generate_c_code_gnsf.py
@@ -54,7 +54,7 @@ def generate_c_code_gnsf( model, opts ):
     model_dir = model_name + '_model'
     if not os.path.exists(model_dir):
         os.mkdir(model_dir)
-    model_dir_location = './' + model_dir
+    model_dir_location = os.path.join('.', model_dir)
     os.chdir(model_dir_location)
 
     # obtain gnsf dimensions
diff --git a/pyextra/acados_template/generate_c_code_implicit_ode.py b/pyextra/acados_template/generate_c_code_implicit_ode.py
index e0138c69f..f2d50b43a 100644
--- a/pyextra/acados_template/generate_c_code_implicit_ode.py
+++ b/pyextra/acados_template/generate_c_code_implicit_ode.py
@@ -114,7 +114,7 @@ def generate_c_code_implicit_ode( model, opts ):
     model_dir = model_name + '_model'
     if not os.path.exists(model_dir):
         os.mkdir(model_dir)
-    model_dir_location = './' + model_dir
+    model_dir_location = os.path.join('.', model_dir)
     os.chdir(model_dir_location)
 
     fun_name = model_name + '_impl_dae_fun'
diff --git a/pyextra/acados_template/generate_c_code_nls_cost.py b/pyextra/acados_template/generate_c_code_nls_cost.py
index cdb0bf001..ffcd78ca7 100644
--- a/pyextra/acados_template/generate_c_code_nls_cost.py
+++ b/pyextra/acados_template/generate_c_code_nls_cost.py
@@ -76,7 +76,7 @@ def generate_c_code_nls_cost( model, cost_name, stage_type, opts ):
     gen_dir = cost_name + '_cost'
     if not os.path.exists(gen_dir):
         os.mkdir(gen_dir)
-    gen_dir_location = './' + gen_dir
+    gen_dir_location = os.path.join('.', gen_dir)
     os.chdir(gen_dir_location)
 
     # set up expressions
diff --git a/pyextra/acados_template/utils.py b/pyextra/acados_template/utils.py
index 3f002b98c..bf8ae4d5d 100644
--- a/pyextra/acados_template/utils.py
+++ b/pyextra/acados_template/utils.py
@@ -45,7 +45,7 @@ def get_acados_path():
     ACADOS_PATH = os.environ.get('ACADOS_SOURCE_DIR')
     if not ACADOS_PATH:
         acados_template_path = os.path.dirname(os.path.abspath(__file__))
-        acados_path = os.path.join(acados_template_path, '../../../')
+        acados_path = os.path.join(acados_template_path, '..','..','..')
         ACADOS_PATH = os.path.realpath(acados_path)
         msg = 'Warning: Did not find environment variable ACADOS_SOURCE_DIR, '
         msg += 'guessed ACADOS_PATH to be {}.\n'.format(ACADOS_PATH)
@@ -54,10 +54,20 @@ def get_acados_path():
     return ACADOS_PATH
 
 
+def get_python_interface_path():
+    ACADOS_PYTHON_INTERFACE_PATH = os.environ.get('ACADOS_PYTHON_INTERFACE_PATH')
+    if not ACADOS_PYTHON_INTERFACE_PATH:
+        acados_path = get_acados_path()
+        ACADOS_PYTHON_INTERFACE_PATH = os.path.join(acados_path, 'interfaces', 'acados_template', 'acados_template')
+    return ACADOS_PYTHON_INTERFACE_PATH
+
+
 def get_tera_exec_path():
     TERA_PATH = os.environ.get('TERA_PATH')
     if not TERA_PATH:
-        TERA_PATH = os.path.join(get_acados_path(), 'bin/t_renderer')
+        TERA_PATH = os.path.join(get_acados_path(), 'bin', 't_renderer')
+        if os.name == 'nt':
+            TERA_PATH += '.exe'
     return TERA_PATH
 
 
@@ -199,9 +209,7 @@ def render_template(in_file, out_file, template_dir, json_path):
 
     # setting up loader and environment
     acados_path = os.path.dirname(os.path.abspath(__file__))
-
-    template_glob = acados_path + '/c_templates_tera/*'
-    acados_template_path = acados_path + '/c_templates_tera'
+    template_glob = os.path.join(acados_path, 'c_templates_tera', '*')
 
     # call tera as system cmd
     os_cmd = "{tera_path} '{template_glob}' '{in_file}' '{json_path}' '{out_file}'".format(
@@ -213,7 +221,7 @@ def render_template(in_file, out_file, template_dir, json_path):
     )
     status = os.system(os_cmd)
     if (status != 0):
-        raise Exception('Rendering of {} failed! Exiting.\n'.format(in_file))
+        raise Exception('Rendering of {} failed!\n\nAttempted to execute OS command:\n{}\n\nExiting.\n'.format(in_file, os_cmd))
 
     os.chdir(cwd)
 
@@ -262,6 +270,14 @@ def acados_class2dict(class_instance):
     return out
 
 
+def get_ocp_nlp_layout():
+    python_interface_path = get_python_interface_path()
+    abs_path = os.path.join(python_interface_path, 'acados_layout.json')
+    with open(abs_path, 'r') as f:
+        ocp_nlp_layout = json.load(f)
+    return ocp_nlp_layout
+
+
 def ocp_check_against_layout(ocp_nlp, ocp_dims):
     """
     Check dimensions against layout
@@ -273,11 +289,7 @@ def ocp_check_against_layout(ocp_nlp, ocp_dims):
     ocp_dims : instance of AcadosOcpDims
     """
 
-    # load JSON layout
-    current_module = sys.modules[__name__]
-    acados_path = os.path.dirname(current_module.__file__)
-    with open(acados_path + '/acados_layout.json', 'r') as f:
-        ocp_nlp_layout = json.load(f)
+    ocp_nlp_layout = get_ocp_nlp_layout()
 
     ocp_check_against_layout_recursion(ocp_nlp, ocp_dims, ocp_nlp_layout)
     return
diff --git a/selfdrive/controls/lib/lateral_mpc_lib/SConscript b/selfdrive/controls/lib/lateral_mpc_lib/SConscript
index 4ebb5cd4e..148e4e123 100644
--- a/selfdrive/controls/lib/lateral_mpc_lib/SConscript
+++ b/selfdrive/controls/lib/lateral_mpc_lib/SConscript
@@ -1,4 +1,4 @@
-Import('env', 'arch')
+Import('env', 'envCython', 'arch', 'common')
 
 gen = "c_generated_code"
 
@@ -33,6 +33,7 @@ generated_files = [
 
   f'{gen}/main_lat.c',
   f'{gen}/acados_solver_lat.h',
+  f'{gen}/acados_solver.pxd',
 
   f'{gen}/lat_model/lat_expl_vde_adj.c',
 
@@ -53,6 +54,24 @@ lenv["CFLAGS"].append("-DACADOS_WITH_QPOASES")
 lenv["CXXFLAGS"].append("-DACADOS_WITH_QPOASES")
 lenv["CCFLAGS"].append("-Wno-unused")
 lenv["LINKFLAGS"].append("-Wl,--disable-new-dtags")
-lenv.SharedLibrary(f"{gen}/acados_ocp_solver_lat",
-                   build_files,
-                   LIBS=['m', 'acados', 'hpipm', 'blasfeo', 'qpOASES_e'])
+lib_solver = lenv.SharedLibrary(f"{gen}/acados_ocp_solver_lat",
+                                build_files,
+                                LIBS=['m', 'acados', 'hpipm', 'blasfeo', 'qpOASES_e'])
+
+# generate cython stuff
+acados_ocp_solver_pyx = File("#pyextra/acados_template/acados_ocp_solver_pyx.pyx")
+acados_ocp_solver_common = File("#pyextra/acados_template/acados_solver_common.pxd")
+libacados_ocp_solver_pxd = File(f'{gen}/acados_solver.pxd')
+libacados_ocp_solver_c = File(f'{gen}/acados_ocp_solver_pyx.c')
+
+lenv2 = envCython.Clone()
+lenv2["LINKFLAGS"] += [lib_solver[0].get_labspath()]
+lenv2.Command(libacados_ocp_solver_c,
+  [acados_ocp_solver_pyx, acados_ocp_solver_common, libacados_ocp_solver_pxd],
+  f'cython' + \
+  f' -o {libacados_ocp_solver_c.get_labspath()}' + \
+  f' -I {libacados_ocp_solver_pxd.get_dir().get_labspath()}' + \
+  f' -I {acados_ocp_solver_common.get_dir().get_labspath()}' + \
+  f' {acados_ocp_solver_pyx.get_labspath()}')
+lib_cython = lenv2.Program(f'{gen}/acados_ocp_solver_pyx.so', [libacados_ocp_solver_c])
+lenv2.Depends(lib_cython, lib_solver)
diff --git a/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py b/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py
index 42cef8b89..6162cf68a 100755
--- a/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py
+++ b/selfdrive/controls/lib/lateral_mpc_lib/lat_mpc.py
@@ -5,8 +5,12 @@ import numpy as np
 from casadi import SX, vertcat, sin, cos
 from selfdrive.controls.lib.drive_helpers import LAT_MPC_N as N
 from selfdrive.controls.lib.drive_helpers import T_IDXS
-from pyextra.acados_template import AcadosModel, AcadosOcp, AcadosOcpSolver
 
+if __name__ == '__main__':  # generating code
+  from pyextra.acados_template import AcadosModel, AcadosOcp, AcadosOcpSolver
+else:
+  # from pyextra.acados_template import AcadosOcpSolverFast
+  from selfdrive.controls.lib.lateral_mpc_lib.c_generated_code.acados_ocp_solver_pyx import AcadosOcpSolverFast  # pylint: disable=no-name-in-module, import-error
 
 LAT_MPC_DIR = os.path.dirname(os.path.abspath(__file__))
 EXPORT_DIR = os.path.join(LAT_MPC_DIR, "c_generated_code")
@@ -110,17 +114,16 @@ def gen_lat_mpc_solver():
 
 class LateralMpc():
   def __init__(self, x0=np.zeros(X_DIM)):
-    self.solver = AcadosOcpSolver('lat', N, EXPORT_DIR)
+    self.solver = AcadosOcpSolverFast('lat', N, EXPORT_DIR)
     self.reset(x0)
 
   def reset(self, x0=np.zeros(X_DIM)):
     self.x_sol = np.zeros((N+1, X_DIM))
     self.u_sol = np.zeros((N, 1))
     self.yref = np.zeros((N+1, 3))
-    self.solver.cost_set_slice(0, N, "yref", self.yref[:N])
+    for i in range(N):
+      self.solver.cost_set(i, "yref", self.yref[i])
     self.solver.cost_set(N, "yref", self.yref[N][:2])
-    W = np.eye(3)
-    self.Ws = np.tile(W[None], reps=(N,1,1))
 
     # Somehow needed for stable init
     for i in range(N+1):
@@ -132,12 +135,11 @@ class LateralMpc():
     self.cost = 0
 
   def set_weights(self, path_weight, heading_weight, steer_rate_weight):
-    self.Ws[:,0,0] = path_weight
-    self.Ws[:,1,1] = heading_weight
-    self.Ws[:,2,2] = steer_rate_weight
-    self.solver.cost_set_slice(0, N, 'W', self.Ws, api='old')
+    W = np.asfortranarray(np.diag([path_weight, heading_weight, steer_rate_weight]))
+    for i in range(N):
+      self.solver.cost_set(i, 'W', W)
     #TODO hacky weights to keep behavior the same
-    self.solver.cost_set(N, 'W', (3/20.)*self.Ws[0,:2,:2])
+    self.solver.cost_set(N, 'W', (3/20.)*W[:2,:2])
 
   def run(self, x0, v_ego, car_rotation_radius, y_pts, heading_pts):
     x0_cp = np.copy(x0)
@@ -145,12 +147,15 @@ class LateralMpc():
     self.solver.constraints_set(0, "ubx", x0_cp)
     self.yref[:,0] = y_pts
     self.yref[:,1] = heading_pts*(v_ego+5.0)
-    self.solver.cost_set_slice(0, N, "yref", self.yref[:N])
+    for i in range(N):
+      self.solver.cost_set(i, "yref", self.yref[i])
     self.solver.cost_set(N, "yref", self.yref[N][:2])
 
     self.solution_status = self.solver.solve()
-    self.solver.fill_in_slice(0, N+1, 'x', self.x_sol)
-    self.solver.fill_in_slice(0, N, 'u', self.u_sol)
+    for i in range(N+1):
+      self.x_sol[i] = self.solver.get(i, 'x')
+    for i in range(N):
+      self.u_sol[i] = self.solver.get(i, 'u')
     self.cost = self.solver.get_cost()
 
 
diff --git a/selfdrive/controls/lib/longitudinal_mpc_lib/SConscript b/selfdrive/controls/lib/longitudinal_mpc_lib/SConscript
index 70a3e72b1..4abe90f8f 100644
--- a/selfdrive/controls/lib/longitudinal_mpc_lib/SConscript
+++ b/selfdrive/controls/lib/longitudinal_mpc_lib/SConscript
@@ -1,4 +1,4 @@
-Import('env', 'arch')
+Import('env', 'envCython', 'arch', 'common')
 
 gen = "c_generated_code"
 
@@ -41,6 +41,7 @@ generated_files = [
 
   f'{gen}/main_long.c',
   f'{gen}/acados_solver_long.h',
+  f'{gen}/acados_solver.pxd',
 
   f'{gen}/long_model/long_expl_vde_adj.c',
 
@@ -63,6 +64,24 @@ lenv["CFLAGS"].append("-DACADOS_WITH_QPOASES")
 lenv["CXXFLAGS"].append("-DACADOS_WITH_QPOASES")
 lenv["CCFLAGS"].append("-Wno-unused")
 lenv["LINKFLAGS"].append("-Wl,--disable-new-dtags")
-lenv.SharedLibrary(f"{gen}/acados_ocp_solver_long",
-                   build_files,
-                   LIBS=['m', 'acados', 'hpipm', 'blasfeo', 'qpOASES_e'])
+lib_solver = lenv.SharedLibrary(f"{gen}/acados_ocp_solver_long",
+                                build_files,
+                                LIBS=['m', 'acados', 'hpipm', 'blasfeo', 'qpOASES_e'])
+
+# generate cython stuff
+acados_ocp_solver_pyx = File("#pyextra/acados_template/acados_ocp_solver_pyx.pyx")
+acados_ocp_solver_common = File("#pyextra/acados_template/acados_solver_common.pxd")
+libacados_ocp_solver_pxd = File(f'{gen}/acados_solver.pxd')
+libacados_ocp_solver_c = File(f'{gen}/acados_ocp_solver_pyx.c')
+
+lenv2 = envCython.Clone()
+lenv2["LINKFLAGS"] += [lib_solver[0].get_labspath()]
+lenv2.Command(libacados_ocp_solver_c,
+  [acados_ocp_solver_pyx, acados_ocp_solver_common, libacados_ocp_solver_pxd],
+  f'cython' + \
+  f' -o {libacados_ocp_solver_c.get_labspath()}' + \
+  f' -I {libacados_ocp_solver_pxd.get_dir().get_labspath()}' + \
+  f' -I {acados_ocp_solver_common.get_dir().get_labspath()}' + \
+  f' {acados_ocp_solver_pyx.get_labspath()}')
+lib_cython = lenv2.Program(f'{gen}/acados_ocp_solver_pyx.so', [libacados_ocp_solver_c])
+lenv2.Depends(lib_cython, lib_solver)
diff --git a/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py b/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py
index b3960fcb5..df6d8e0fb 100644
--- a/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py
+++ b/selfdrive/controls/lib/longitudinal_mpc_lib/long_mpc.py
@@ -8,7 +8,12 @@ from selfdrive.swaglog import cloudlog
 from selfdrive.modeld.constants import index_function
 from selfdrive.controls.lib.radar_helpers import _LEAD_ACCEL_TAU
 
-from pyextra.acados_template import AcadosModel, AcadosOcp, AcadosOcpSolver
+if __name__ == '__main__':  # generating code
+  from pyextra.acados_template import AcadosModel, AcadosOcp, AcadosOcpSolver
+else:
+  # from pyextra.acados_template import AcadosOcpSolver as AcadosOcpSolverFast
+  from selfdrive.controls.lib.longitudinal_mpc_lib.c_generated_code.acados_ocp_solver_pyx import AcadosOcpSolverFast  # pylint: disable=no-name-in-module, import-error
+
 from casadi import SX, vertcat
 
 LONG_MPC_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -190,13 +195,14 @@ class LongitudinalMpc():
     self.source = SOURCES[2]
 
   def reset(self):
-    self.solver = AcadosOcpSolver('long', N, EXPORT_DIR)
+    self.solver = AcadosOcpSolverFast('long', N, EXPORT_DIR)
     self.v_solution = [0.0 for i in range(N+1)]
     self.a_solution = [0.0 for i in range(N+1)]
     self.j_solution = [0.0 for i in range(N)]
     self.yref = np.zeros((N+1, COST_DIM))
-    self.solver.cost_set_slice(0, N, "yref", self.yref[:N])
-    self.solver.set(N, "yref", self.yref[N][:COST_E_DIM])
+    for i in range(N):
+      self.solver.cost_set(i, "yref", self.yref[i])
+    self.solver.cost_set(N, "yref", self.yref[N][:COST_E_DIM])
     self.x_sol = np.zeros((N+1, X_DIM))
     self.u_sol = np.zeros((N,1))
     self.params = np.zeros((N+1,3))
@@ -216,30 +222,30 @@ class LongitudinalMpc():
       self.set_weights_for_lead_policy()
 
   def set_weights_for_lead_policy(self):
-    W = np.diag([X_EGO_OBSTACLE_COST, X_EGO_COST, V_EGO_COST, A_EGO_COST, J_EGO_COST])
-    Ws = np.tile(W[None], reps=(N,1,1))
-    self.solver.cost_set_slice(0, N, 'W', Ws, api='old')
+    W = np.asfortranarray(np.diag([X_EGO_OBSTACLE_COST, X_EGO_COST, V_EGO_COST, A_EGO_COST, J_EGO_COST]))
+    for i in range(N):
+      self.solver.cost_set(i, 'W', W)
     # Setting the slice without the copy make the array not contiguous,
     # causing issues with the C interface.
     self.solver.cost_set(N, 'W', np.copy(W[:COST_E_DIM, :COST_E_DIM]))
 
     # Set L2 slack cost on lower bound constraints
     Zl = np.array([LIMIT_COST, LIMIT_COST, LIMIT_COST, DANGER_ZONE_COST])
-    Zls = np.tile(Zl[None], reps=(N+1,1,1))
-    self.solver.cost_set_slice(0, N+1, 'Zl', Zls, api='old')
+    for i in range(N):
+      self.solver.cost_set(i, 'Zl', Zl)
 
   def set_weights_for_xva_policy(self):
-    W = np.diag([0., 10., 1., 10., 1.])
-    Ws = np.tile(W[None], reps=(N,1,1))
-    self.solver.cost_set_slice(0, N, 'W', Ws, api='old')
+    W = np.asfortranarray(np.diag([0., 10., 1., 10., 1.]))
+    for i in range(N):
+      self.solver.cost_set(i, 'W', W)
     # Setting the slice without the copy make the array not contiguous,
     # causing issues with the C interface.
     self.solver.cost_set(N, 'W', np.copy(W[:COST_E_DIM, :COST_E_DIM]))
 
     # Set L2 slack cost on lower bound constraints
     Zl = np.array([LIMIT_COST, LIMIT_COST, LIMIT_COST, 0.0])
-    Zls = np.tile(Zl[None], reps=(N+1,1,1))
-    self.solver.cost_set_slice(0, N+1, 'Zl', Zls, api='old')
+    for i in range(N):
+      self.solver.cost_set(i, 'Zl', Zl)
 
   def set_cur_state(self, v, a):
     if abs(self.x0[1] - v) > 1.:
@@ -326,8 +332,9 @@ class LongitudinalMpc():
     self.yref[:,1] = x
     self.yref[:,2] = v
     self.yref[:,3] = a
-    self.solver.cost_set_slice(0, N, "yref", self.yref[:N], api='old')
-    self.solver.set(N, "yref", self.yref[N][:COST_E_DIM])
+    for i in range(N):
+      self.solver.cost_set(i, "yref", self.yref[i])
+    self.solver.cost_set(N, "yref", self.yref[N][:COST_E_DIM])
     self.accel_limit_arr[:,0] = -10.
     self.accel_limit_arr[:,1] = 10.
     x_obstacle = 1e5*np.ones((N+1))
@@ -338,12 +345,14 @@ class LongitudinalMpc():
 
   def run(self):
     for i in range(N+1):
-      self.solver.set_param(i, self.params[i])
+      self.solver.set(i, 'p', self.params[i])
     self.solver.constraints_set(0, "lbx", self.x0)
     self.solver.constraints_set(0, "ubx", self.x0)
     self.solution_status = self.solver.solve()
-    self.solver.fill_in_slice(0, N+1, 'x', self.x_sol)
-    self.solver.fill_in_slice(0, N, 'u', self.u_sol)
+    for i in range(N+1):
+      self.x_sol[i] = self.solver.get(i, 'x')
+    for i in range(N):
+      self.u_sol[i] = self.solver.get(i, 'u')
 
     self.v_solution = self.x_sol[:,1]
     self.a_solution = self.x_sol[:,2]
diff --git a/third_party/acados/aarch64/lib/libacados.so b/third_party/acados/aarch64/lib/libacados.so
index 248e8c8fa..fe961d8c5 100755
Binary files a/third_party/acados/aarch64/lib/libacados.so and b/third_party/acados/aarch64/lib/libacados.so differ
diff --git a/third_party/acados/aarch64/lib/libblasfeo.so b/third_party/acados/aarch64/lib/libblasfeo.so
index 38cfb0e9e..a59f3f79f 100755
Binary files a/third_party/acados/aarch64/lib/libblasfeo.so and b/third_party/acados/aarch64/lib/libblasfeo.so differ
diff --git a/third_party/acados/aarch64/lib/libhpipm.so b/third_party/acados/aarch64/lib/libhpipm.so
index fa7eae088..6916c9c71 100755
Binary files a/third_party/acados/aarch64/lib/libhpipm.so and b/third_party/acados/aarch64/lib/libhpipm.so differ
diff --git a/third_party/acados/build.sh b/third_party/acados/build.sh
index 749ac8291..9216032e9 100755
--- a/third_party/acados/build.sh
+++ b/third_party/acados/build.sh
@@ -18,7 +18,7 @@ if [ ! -d acados_repo/ ]; then
 fi
 cd acados_repo
 git fetch
-git checkout 43ba28e95062f9ac9b48facd3b45698d57666fa3
+git checkout 79e9e3e76f2751198858adf382c97837833ad31f
 git submodule update --recursive --init
 
 # build
diff --git a/third_party/acados/include/acados/ocp_nlp/ocp_nlp_common.h b/third_party/acados/include/acados/ocp_nlp/ocp_nlp_common.h
index 040763381..64939b9ed 100644
--- a/third_party/acados/include/acados/ocp_nlp/ocp_nlp_common.h
+++ b/third_party/acados/include/acados/ocp_nlp/ocp_nlp_common.h
@@ -135,6 +135,8 @@ typedef struct ocp_nlp_dims
     int *nz;  // number of algebraic variables
     int *ns;  // number of slack variables
     int N;    // number of shooting nodes
+
+    void *raw_memory; // Pointer to allocated memory, to be used for freeing
 } ocp_nlp_dims;
 
 //
@@ -203,6 +205,9 @@ typedef struct ocp_nlp_in
     /// Pointers to constraints functions (TBC).
     void **constraints;
 
+    /// Pointer to allocated memory, to be used for freeing.
+    void *raw_memory;
+
 } ocp_nlp_in;
 
 //
@@ -235,6 +240,8 @@ typedef struct ocp_nlp_out
     double inf_norm_res;
     double total_time;
 
+    void *raw_memory; // Pointer to allocated memory, to be used for freeing
+
 } ocp_nlp_out;
 
 //
diff --git a/third_party/acados/include/acados/sim/sim_collocation_utils.h b/third_party/acados/include/acados/sim/sim_collocation_utils.h
index 850dab50d..40a0b1c0c 100644
--- a/third_party/acados/include/acados/sim/sim_collocation_utils.h
+++ b/third_party/acados/include/acados/sim/sim_collocation_utils.h
@@ -43,44 +43,53 @@ extern "C" {
 
 
 
-enum Newton_type_collocation
+// enum Newton_type_collocation
+// {
+//     exact = 0,
+//     simplified_in,
+//     simplified_inis
+// };
+
+
+
+// typedef struct
+// {
+//     enum Newton_type_collocation type;
+//     double *eig;
+//     double *low_tria;
+//     bool single;
+//     bool freeze;
+
+//     double *transf1;
+//     double *transf2;
+
+//     double *transf1_T;
+//     double *transf2_T;
+// } Newton_scheme;
+
+
+typedef enum
 {
-    exact = 0,
-    simplified_in,
-    simplified_inis
-};
-
-
-
-typedef struct
-{
-    enum Newton_type_collocation type;
-    double *eig;
-    double *low_tria;
-    bool single;
-    bool freeze;
-
-    double *transf1;
-    double *transf2;
-
-    double *transf1_T;
-    double *transf2_T;
-} Newton_scheme;
-
+    GAUSS_LEGENDRE,
+    GAUSS_RADAU_IIA,
+} sim_collocation_type;
 
 
 //
-acados_size_t gauss_nodes_work_calculate_size(int ns);
+// acados_size_t gauss_legendre_nodes_work_calculate_size(int ns);
 //
-void gauss_nodes(int ns, double *nodes, void *raw_memory);
+// void gauss_legendre_nodes(int ns, double *nodes, void *raw_memory);
 //
-acados_size_t gauss_simplified_work_calculate_size(int ns);
+// acados_size_t gauss_simplified_work_calculate_size(int ns);
+// //
+// void gauss_simplified(int ns, Newton_scheme *scheme, void *work);
 //
-void gauss_simplified(int ns, Newton_scheme *scheme, void *work);
+acados_size_t butcher_tableau_work_calculate_size(int ns);
 //
-acados_size_t butcher_table_work_calculate_size(int ns);
+// void calculate_butcher_tableau_from_nodes(int ns, double *nodes, double *b, double *A, void *work);
 //
-void butcher_table(int ns, double *nodes, double *b, double *A, void *work);
+void calculate_butcher_tableau(int ns, sim_collocation_type collocation_type, double *c_vec,
+                               double *b_vec, double *A_mat, void *work);
 
 
 
diff --git a/third_party/acados/include/acados/sim/sim_common.h b/third_party/acados/include/acados/sim/sim_common.h
index 177f0893d..1838d76f8 100644
--- a/third_party/acados/include/acados/sim/sim_common.h
+++ b/third_party/acados/include/acados/sim/sim_common.h
@@ -141,12 +141,13 @@ typedef struct
     bool output_z;        // 1 -- if zn should be computed
     bool sens_algebraic;  // 1 -- if S_algebraic should be computed
     bool exact_z_output;  // 1 -- if z, S_algebraic should be computed exactly, extra Newton iterations
+    sim_collocation_type collocation_type;
 
     // for explicit integrators: newton_iter == 0 && scheme == NULL
     // && jac_reuse=false
     int newton_iter;
     bool jac_reuse;
-    Newton_scheme *scheme;
+    // Newton_scheme *scheme;
 
     // workspace
     void *work;
diff --git a/third_party/acados/include/acados/utils/math.h b/third_party/acados/include/acados/utils/math.h
index ac0d3263e..fe1da875f 100644
--- a/third_party/acados/include/acados/utils/math.h
+++ b/third_party/acados/include/acados/utils/math.h
@@ -40,7 +40,7 @@ extern "C" {
 
 #include "acados/utils/types.h"
 
-#if defined(__DSPACE__)
+#if defined(__MABX2__)
 double fmax(double a, double b);
 #endif
 
diff --git a/third_party/acados/include/acados/utils/timing.h b/third_party/acados/include/acados/utils/timing.h
index 823b763f6..fe561d389 100644
--- a/third_party/acados/include/acados/utils/timing.h
+++ b/third_party/acados/include/acados/utils/timing.h
@@ -67,7 +67,7 @@ typedef struct acados_timer_
     mach_timebase_info_data_t tinfo;
 } acados_timer;
 
-#elif defined(__DSPACE__)
+#elif defined(__MABX2__)
 
 #include <brtenv.h>
 
diff --git a/third_party/acados/include/acados_c/ocp_nlp_interface.h b/third_party/acados/include/acados_c/ocp_nlp_interface.h
index 0f250a662..b6b893024 100644
--- a/third_party/acados/include/acados_c/ocp_nlp_interface.h
+++ b/third_party/acados/include/acados_c/ocp_nlp_interface.h
@@ -227,10 +227,8 @@ int ocp_nlp_dynamics_model_set(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_n
 /// \param field The name of the field, either nls_res_jac,
 ///     y_ref, W (others TBC)
 /// \param value Cost values.
-int ocp_nlp_cost_model_set_slice(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_in *in,
-        int start_stage, int end_stage, const char *field, void *value, int dim);
 int ocp_nlp_cost_model_set(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_in *in,
-        int start_stage, const char *field, void *value);
+        int stage, const char *field, void *value);
 
 
 /// Sets the function pointers to the constraints functions for the given stage.
@@ -241,8 +239,6 @@ int ocp_nlp_cost_model_set(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_i
 /// \param stage Stage number.
 /// \param field The name of the field, either lb, ub (others TBC)
 /// \param value Constraints function or values.
-int ocp_nlp_constraints_model_set_slice(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_in *in,
-        int start_stage, int end_stage, const char *field, void *value, int dim);
 int ocp_nlp_constraints_model_set(ocp_nlp_config *config, ocp_nlp_dims *dims,
         ocp_nlp_in *in, int stage, const char *field, void *value);
 
@@ -283,9 +279,6 @@ void ocp_nlp_out_set(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *ou
 void ocp_nlp_out_get(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
         int stage, const char *field, void *value);
 
-void ocp_nlp_out_get_slice(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
-        int start_stage, int end_stage, const char *field, void *value);
-
 //
 void ocp_nlp_get_at_stage(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_solver *solver,
         int stage, const char *field, void *value);
@@ -300,6 +293,8 @@ void ocp_nlp_constraint_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims
 void ocp_nlp_cost_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
         int stage, const char *field, int *dims_out);
 
+void ocp_nlp_dynamics_dims_get_from_attr(ocp_nlp_config *config, ocp_nlp_dims *dims, ocp_nlp_out *out,
+        int stage, const char *field, int *dims_out);
 
 /* opts */
 
@@ -374,6 +369,8 @@ int ocp_nlp_precompute(ocp_nlp_solver *solver, ocp_nlp_in *nlp_in, ocp_nlp_out *
 /// \param nlp_out The output struct.
 void ocp_nlp_eval_cost(ocp_nlp_solver *solver, ocp_nlp_in *nlp_in, ocp_nlp_out *nlp_out);
 
+//
+void ocp_nlp_eval_residuals(ocp_nlp_solver *solver, ocp_nlp_in *nlp_in, ocp_nlp_out *nlp_out);
 
 //
 void ocp_nlp_eval_param_sens(ocp_nlp_solver *solver, char *field, int stage, int index, ocp_nlp_out *sens_nlp_out);
diff --git a/third_party/acados/include/acados_c/sim_interface.h b/third_party/acados/include/acados_c/sim_interface.h
index ba2e67bd6..cc47b62ed 100644
--- a/third_party/acados/include/acados_c/sim_interface.h
+++ b/third_party/acados/include/acados_c/sim_interface.h
@@ -45,11 +45,11 @@ extern "C" {
 
 typedef enum
 {
-	ERK,
-	IRK,
-	GNSF,
-	LIFTED_IRK,
-	INVALID_SIM_SOLVER,
+    ERK,
+    IRK,
+    GNSF,
+    LIFTED_IRK,
+    INVALID_SIM_SOLVER,
 } sim_solver_t;
 
 
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_block_size.h b/third_party/acados/include/blasfeo/include/blasfeo_block_size.h
index 182cc71b1..0ccd4f1bb 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_block_size.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_block_size.h
@@ -43,7 +43,28 @@
 
 
 
-#if defined( TARGET_X64_INTEL_HASWELL )
+#if defined( TARGET_X64_INTEL_SKYLAKE_X )
+// common
+#define CACHE_LINE_SIZE 64 // data cache size: 64 bytes
+#define L1_CACHE_SIZE (32*1024) // L1 data cache size: 32 kB, 8-way
+#define L2_CACHE_SIZE (256*1024) //(1024*1024) // L2 data cache size: 1 MB ; DTLB1 64*4 kB = 256 kB
+#define LLC_CACHE_SIZE (6*1024*1024) //(8*1024*1024) // LLC cache size: 8 MB ; TLB 1536*4 kB = 6 MB
+// double
+#define D_PS 8 // panel size
+#define D_PLD 8 // 4 // GCD of panel length
+#define D_M_KERNEL 24 // max kernel size
+#define D_KC 128 //256 // 192
+#define D_NC 144 //72 //96 //72 // 120 // 512
+#define D_MC 2400 // 6000
+// single
+#define S_PS 16 // panel size
+#define S_PLD 4 // GCD of panel length TODO probably 16 when writing assebly
+#define S_M_KERNEL 32 // max kernel size
+#define S_KC 128 //256
+#define S_NC 128 //144
+#define S_MC 3000
+
+#elif defined( TARGET_X64_INTEL_HASWELL )
 // common
 #define CACHE_LINE_SIZE 64 // data cache size: 64 bytes
 #define L1_CACHE_SIZE (32*1024) // L1 data cache size: 32 kB, 8-way
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_api.h b/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_api.h
index f27264e13..15915f436 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_api.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_api.h
@@ -101,18 +101,18 @@ void blasfeo_dtrsv_ltu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct bl
 void blasfeo_dtrsv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z <= inv( A^T ) * x, A (m)x(m) upper, transposed, not_unit
 void blasfeo_dtrsv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A * x ; A lower triangular
+void blasfeo_dtrmv_lnn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A * x ; A lower triangular, unit diagonal
+void blasfeo_dtrmv_lnu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A^T * x ; A lower triangular
+void blasfeo_dtrmv_ltn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A^T * x ; A lower triangular, unit diagonal
+void blasfeo_dtrmv_ltu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z <= beta * y + alpha * A * x ; A upper triangular
 void blasfeo_dtrmv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z <= A^T * x ; A upper triangular
 void blasfeo_dtrmv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A * x ; A lower triangular
-void blasfeo_dtrmv_lnn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A^T * x ; A lower triangular
-void blasfeo_dtrmv_ltn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A * x ; A lower triangular, unit diagonal
-void blasfeo_dtrmv_lnu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A^T * x ; A lower triangular, unit diagonal
-void blasfeo_dtrmv_ltu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z_n <= beta_n * y_n + alpha_n * A  * x_n
 // z_t <= beta_t * y_t + alpha_t * A^T * x_t
 void blasfeo_dgemv_nt(int m, int n, double alpha_n, double alpha_t, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx_n, int xi_n, struct blasfeo_dvec *sx_t, int xi_t, double beta_n, double beta_t, struct blasfeo_dvec *sy_n, int yi_n, struct blasfeo_dvec *sy_t, int yi_t, struct blasfeo_dvec *sz_n, int zi_n, struct blasfeo_dvec *sz_t, int zi_t);
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_ref_api.h b/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_ref_api.h
index eaac36581..5773715cf 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_ref_api.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_d_blasfeo_ref_api.h
@@ -101,23 +101,24 @@ void blasfeo_ref_dtrsv_ltu(int m, struct blasfeo_dmat *sA, int ai, int aj, struc
 void blasfeo_ref_dtrsv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
 void blasfeo_ref_dtrsv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A * x ; A lower triangular
+void blasfeo_ref_dtrmv_lnn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A * x ; A lower triangular, unit diagonal
+void blasfeo_ref_dtrmv_lnu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void blasfeo_ref_dtrmv_ltn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
+// z <= A' * x ; A lower triangular, unit diagonal
+void blasfeo_ref_dtrmv_ltu(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z <= beta * y + alpha * A * x ; A upper triangular
 void blasfeo_ref_dtrmv_unn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z <= A' * x ; A upper triangular
 void blasfeo_ref_dtrmv_utn(int m, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A * x ; A lower triangular
-void blasfeo_ref_dtrmv_lnn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A' * x ; A lower triangular
-void blasfeo_ref_dtrmv_ltn(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A * x ; A lower triangular, unit diagonal
-void blasfeo_ref_dtrmv_lnu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
-// z <= A' * x ; A lower triangular, unit diagonal
-void blasfeo_ref_dtrmv_ltu(int m, int n, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, struct blasfeo_dvec *sz, int zi);
 // z_n <= beta_n * y_n + alpha_n * A  * x_n
 // z_t <= beta_t * y_t + alpha_t * A' * x_t
 void blasfeo_ref_dgemv_nt(int m, int n, double alpha_n, double alpha_t, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx_n, int xi_n, struct blasfeo_dvec *sx_t, int xi_t, double beta_n, double beta_t, struct blasfeo_dvec *sy_n, int yi_n, struct blasfeo_dvec *sy_t, int yi_t, struct blasfeo_dvec *sz_n, int zi_n, struct blasfeo_dvec *sz_t, int zi_t);
 // z <= beta * y + alpha * A * x, where A is symmetric and only the lower triangular patr of A is accessed
-void blasfeo_ref_dsymv_l(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi);
+void blasfeo_ref_dsymv_l(int m, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi);
+void blasfeo_ref_dsymv_l_mn(int m, int n, double alpha, struct blasfeo_dmat *sA, int ai, int aj, struct blasfeo_dvec *sx, int xi, double beta, struct blasfeo_dvec *sy, int yi, struct blasfeo_dvec *sz, int zi);
 
 // diagonal
 
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_d_kernel.h b/third_party/acados/include/blasfeo/include/blasfeo_d_kernel.h
index bf110c2bf..709485769 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_d_kernel.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_d_kernel.h
@@ -54,6 +54,158 @@ void blasfeo_align_4096_byte(void *ptr, void **ptr_align);
 void blasfeo_align_64_byte(void *ptr, void **ptr_align);
 
 
+//
+// lib8
+//
+
+// 24x8
+void kernel_dgemm_nt_24x8_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nt_24x8_vs_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int m1, int n1); //
+void kernel_dtrsm_nt_rl_inv_24x8_lib8(int k, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E); //
+void kernel_dpotrf_nt_l_24x8_lib8(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_24x8_vs_lib8(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int m1, int n1);
+void kernel_dtrsm_nt_rl_inv_24x8_vs_lib8(int k, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int m1, int n1); //
+void kernel_dgemm_dtrsm_nt_rl_inv_24x8_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dgemm_dtrsm_nt_rl_inv_24x8_vs_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int m1, int n1);
+void kernel_dsyrk_dpotrf_nt_l_24x8_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_24x8_vs_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int m1, int n1);
+void kernel_dlarfb8_rn_24_lib8(int kmax, double *pV, double *pT, double *pD, int sdd);
+// 16x8
+void kernel_dgemm_nt_16x8_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nt_16x8_vs_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int m1, int n1); //
+void kernel_dgemm_nt_16x8_gen_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dgemm_nn_16x8_lib8(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dgemm_nn_16x8_vs_lib8(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, double *C, int sdc, double *D, int sdd, int m1, int n1); //
+void kernel_dgemm_nn_16x8_gen_lib8(int k, double *alpha, double *A, int sda, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dsyrk_nt_l_16x8_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd); //
+void kernel_dsyrk_nt_l_16x8_vs_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, int m1, int n1); //
+void kernel_dsyrk_nt_l_16x8_gen_lib8(int k, double *alpha, double *A, int sda, double *B, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dtrmm_nn_rl_16x8_lib8(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd);
+void kernel_dtrmm_nn_rl_16x8_vs_lib8(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, double *D, int sdd, int m1, int n1);
+void kernel_dtrmm_nn_rl_16x8_gen_lib8(int k, double *alpha, double *A, int sda, int offsetB, double *B, int sdb, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrsm_nt_rl_inv_16x8_lib8(int k, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E); //
+void kernel_dtrsm_nt_rl_inv_16x8_vs_lib8(int k, double *A, int sda, double *B, double *beta, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int m1, int n1); //
+void kernel_dpotrf_nt_l_16x8_lib8(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dpotrf_nt_l_16x8_vs_lib8(int k, double *A, int sda, double *B, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int m1, int n1);
+void kernel_dgemm_dtrsm_nt_rl_inv_16x8_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dgemm_dtrsm_nt_rl_inv_16x8_vs_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int m1, int n1);
+void kernel_dsyrk_dpotrf_nt_l_16x8_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_16x8_vs_lib8(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int m1, int n1);
+void kernel_dlarfb8_rn_16_lib8(int kmax, double *pV, double *pT, double *pD, int sdd);
+void kernel_dlarfb8_rn_la_16_lib8(int n1, double *pVA, double *pT, double *pD, int sdd, double *pA, int sda);
+void kernel_dlarfb8_rn_lla_16_lib8(int n0, int n1, double *pVL, double *pVA, double *pT, double *pD, int sdd, double *pL, int sdl, double *pA, int sda);
+// 8x16
+void kernel_dgemm_tt_8x16_lib8(int k, double *alpha, int offA, double *A, int sda, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_tt_8x16_vs_lib8(int k, double *alpha, int offA, double *A, int sda, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1); //
+void kernel_dgemm_tt_8x16_gen_lib8(int k, double *alpha, int offA, double *A, int sda, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dgemm_nt_8x16_lib8(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_8x16_vs_lib8(int k, double *alpha, double *A, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1); //
+// 8x8
+void kernel_dgemm_nt_8x8_lib8(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dgemm_nt_8x8_vs_lib8(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int m1, int n1); //
+void kernel_dgemm_nt_8x8_gen_lib8(int k, double *alpha, double *A, double *B, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dgemm_nn_8x8_lib8(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, double *C, double *D); //
+void kernel_dgemm_nn_8x8_vs_lib8(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1); //
+void kernel_dgemm_nn_8x8_gen_lib8(int k, double *alpha, double *A, int offB, double *B, int sdb, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dgemm_tt_8x8_lib8(int k, double *alpha, int offA, double *A, int sda, double *B, double *beta, double *C, double *D); //
+void kernel_dgemm_tt_8x8_vs_lib8(int k, double *alpha, int offA, double *A, int sda, double *B, double *beta, double *C, double *D, int m1, int n1); //
+void kernel_dgemm_tt_8x8_gen_lib8(int k, double *alpha, int offA, double *A, int sda, double *B, double *beta, int offc, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dsyrk_nt_l_8x8_lib8(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D); //
+void kernel_dsyrk_nt_l_8x8_vs_lib8(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int m1, int n1); //
+void kernel_dsyrk_nt_l_8x8_gen_lib8(int k, double *alpha, double *A, double *B, double *beta, int offC, double *C, int sdc, int offD, double *D, int sdd, int m0, int m1, int n0, int n1); //
+void kernel_dtrmm_nn_rl_8x8_lib8(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D);
+void kernel_dtrmm_nn_rl_8x8_vs_lib8(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *D, int m1, int n1);
+void kernel_dtrmm_nn_rl_8x8_gen_lib8(int k, double *alpha, double *A, int offsetB, double *B, int sdb, int offD, double *D, int sdd, int m0, int m1, int n0, int n1);
+void kernel_dtrsm_nt_rl_inv_8x8_lib8(int k, double *A, double *B, double *beta, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dtrsm_nt_rl_inv_8x8_vs_lib8(int k, double *A, double *B, double *beta, double *C, double *D, double *E, double *inv_diag_E, int m1, int n1);
+void kernel_dpotrf_nt_l_8x8_lib8(int k, double *A, double *B, double *C, double *D, double *inv_diag_D);
+void kernel_dpotrf_nt_l_8x8_vs_lib8(int k, double *A, double *B, double *C, double *D, double *inv_diag_D, int m1, int n1);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x8_lib8(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x8_vs_lib8(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int m1, int n1);
+void kernel_dsyrk_dpotrf_nt_l_8x8_lib8(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib8(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int m1, int n1);
+void kernel_dgelqf_vs_lib8(int m, int n, int k, int offD, double *pD, int sdd, double *dD);
+void kernel_dgelqf_pd_vs_lib8(int m, int n, int k, int offD, double *pD, int sdd, double *dD);
+void kernel_dgelqf_8_lib8(int kmax, double *pD, double *dD);
+void kernel_dgelqf_pd_8_lib8(int kmax, double *pD, double *dD);
+void kernel_dlarft_8_lib8(int kmax, double *pD, double *dD, double *pT);
+void kernel_dlarfb8_rn_8_lib8(int kmax, double *pV, double *pT, double *pD);
+void kernel_dlarfb8_rn_8_vs_lib8(int kmax, double *pV, double *pT, double *pD, int m1);
+void kernel_dlarfb8_rn_1_lib8(int kmax, double *pV, double *pT, double *pD);
+void kernel_dgelqf_dlarft8_8_lib8(int kmax, double *pD, double *dD, double *pT);
+void kernel_dgelqf_pd_dlarft8_8_lib8(int kmax, double *pD, double *dD, double *pT);
+void kernel_dgelqf_pd_la_vs_lib8(int m, int n1, int k, int offD, double *pD, int sdd, double *dD, int offA, double *pA, int sda);
+void kernel_dgelqf_pd_la_dlarft8_8_lib8(int kmax, double *pD, double *dD, double *pA, double *pT);
+void kernel_dlarft_la_8_lib8(int n1, double *dD, double *pA, double *pT);
+void kernel_dlarfb8_rn_la_8_lib8(int n1, double *pVA, double *pT, double *pD, double *pA);
+void kernel_dlarfb8_rn_la_8_vs_lib8(int n1, double *pVA, double *pT, double *pD, double *pA, int m1);
+void kernel_dlarfb8_rn_la_1_lib8(int n1, double *pVA, double *pT, double *pD, double *pA);
+void kernel_dgelqf_pd_lla_vs_lib8(int m, int n0, int n1, int k, int offD, double *pD, int sdd, double *dD, int offL, double *pL, int sdl, int offA, double *pA, int sda);
+void kernel_dgelqf_pd_lla_dlarft8_8_lib8(int n0, int n1, double *pD, double *dD, double *pL, double *pA, double *pT);
+void kernel_dlarft_lla_8_lib8(int n0, int n1, double *dD, double *pL, double *pA, double *pT);
+void kernel_dlarfb8_rn_lla_8_lib8(int n0, int n1, double *pVL, double *pVA, double *pT, double *pD, double *pL, double *pA);
+void kernel_dlarfb8_rn_lla_8_vs_lib8(int n0, int n1, double *pVL, double *pVA, double *pT, double *pD, double *pL, double *pA, int m1);
+void kernel_dlarfb8_rn_lla_1_lib8(int n0, int n1, double *pVL, double *pVA, double *pT, double *pD, double *pL, double *pA);
+
+// panel copy / pack
+// 24
+void kernel_dpack_nn_24_lib8(int kmax, double *A, int lda, double *C, int sdc);
+void kernel_dpack_nn_24_vs_lib8(int kmax, double *A, int lda, double *C, int sdc, int m1);
+// 16
+void kernel_dpacp_nn_16_lib8(int kmax, int offsetA, double *A, int sda, double *B, int sdb);
+void kernel_dpacp_nn_16_vs_lib8(int kmax, int offsetA, double *A, int sda, double *B, int sdb, int m1);
+void kernel_dpack_nn_16_lib8(int kmax, double *A, int lda, double *C, int sdc);
+void kernel_dpack_nn_16_vs_lib8(int kmax, double *A, int lda, double *C, int sdc, int m1);
+// 8
+void kernel_dpacp_nn_8_lib8(int kmax, int offsetA, double *A, int sda, double *B);
+void kernel_dpacp_nn_8_vs_lib8(int kmax, int offsetA, double *A, int sda, double *B, int m1);
+void kernel_dpacp_tn_8_lib8(int kmax, int offsetA, double *A, int sda, double *B);
+void kernel_dpacp_tn_8_vs_lib8(int kmax, int offsetA, double *A, int sda, double *B, int m1);
+void kernel_dpacp_l_nn_8_lib8(int kmax, int offsetA, double *A, int sda, double *B);
+void kernel_dpacp_l_nn_8_vs_lib8(int kmax, int offsetA, double *A, int sda, double *B, int m1);
+void kernel_dpacp_l_tn_8_lib8(int kmax, int offsetA, double *A, int sda, double *B);
+void kernel_dpacp_l_tn_8_vs_lib8(int kmax, int offsetA, double *A, int sda, double *B, int m1);
+void kernel_dpaad_nn_8_lib8(int kmax, double *alpha, int offsetA, double *A, int sda, double *B);
+void kernel_dpaad_nn_8_vs_lib8(int kmax, double *alpha, int offsetA, double *A, int sda, double *B, int m1);
+void kernel_dpack_nn_8_lib8(int kmax, double *A, int lda, double *C);
+void kernel_dpack_nn_8_vs_lib8(int kmax, double *A, int lda, double *C, int m1);
+void kernel_dpack_tn_8_lib8(int kmax, double *A, int lda, double *C);
+void kernel_dpack_tn_8_vs_lib8(int kmax, double *A, int lda, double *C, int m1);
+// 4
+void kernel_dpack_tt_4_lib8(int kmax, double *A, int lda, double *C, int sdc); // TODO offsetC 
+void kernel_dpack_tt_4_vs_lib8(int kmax, double *A, int lda, double *C, int sdc, int m1); // TODO offsetC 
+
+// level 2 BLAS
+// 16
+void kernel_dgemv_n_16_lib8(int k, double *alpha, double *A, int sda, double *x, double *beta, double *y, double *z);
+// 8
+void kernel_dgemv_n_8_lib8(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_n_8_vs_lib8(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int m1);
+//void kernel_dgemv_n_8_gen_lib8(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int m0, int m1);
+void kernel_dgemv_n_8_gen_lib8(int k, double *alpha, int offsetA, double *A, double *x, double *beta, double *y, double *z, int m1);
+void kernel_dgemv_t_8_lib8(int k, double *alpha, int offsetA, double *A, int sda, double *x, double *beta, double *y, double *z);
+void kernel_dgemv_t_8_vs_lib8(int k, double *alpha, int offsetA, double *A, int sda, double *x, double *beta, double *y, double *z, int n1);
+void kernel_dgemv_nt_8_lib8(int kmax, double *alpha_n, double *alpha_t, int offsetA, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
+void kernel_dgemv_nt_8_vs_lib8(int kmax, double *alpha_n, double *alpha_t, int offsetA, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int n1);
+void kernel_dsymv_l_8_lib8(int kmax, double *alpha, double *A, int sda, double *x, double *z);
+void kernel_dsymv_l_8_vs_lib8(int kmax, double *alpha, double *A, int sda, double *x, double *z, int n1);
+void kernel_dsymv_l_8_gen_lib8(int kmax, double *alpha, int offsetA, double *A, int sda, double *x, double *z, int n1);
+void kernel_dtrmv_n_ln_8_lib8(int k, double *A, double *x, double *z);
+void kernel_dtrmv_n_ln_8_vs_lib8(int k, double *A, double *x, double *z, int m1);
+void kernel_dtrmv_n_ln_8_gen_lib8(int k, int offsetA, double *A, double *x, double *z, int m1);
+void kernel_dtrmv_t_ln_8_lib8(int k, double *A, int sda, double *x, double *z);
+void kernel_dtrmv_t_ln_8_vs_lib8(int k, double *A, int sda, double *x, double *z, int n1);
+void kernel_dtrmv_t_ln_8_gen_lib8(int k, int offsetA, double *A, int sda, double *x, double *z, int n1);
+void kernel_dtrsv_n_l_inv_8_lib8(int k, double *A, double *inv_diag_A, double *x, double *z);
+void kernel_dtrsv_n_l_inv_8_vs_lib8(int k, double *A, double *inv_diag_A, double *x, double *z, int m1, int n1);
+void kernel_dtrsv_t_l_inv_8_lib8(int k, double *A, int sda, double *inv_diag_A, double *x, double *z);
+void kernel_dtrsv_t_l_inv_8_vs_lib8(int k, double *A, int sda, double *inv_diag_A, double *x, double *z, int m1, int n1);
+
+
+
+//
+// lib4
+//
 
 // level 2 BLAS
 // 12
@@ -413,10 +565,10 @@ void kernel_drowsw_lib4(int kmax, double *pA, double *pC);
 
 // merged routines
 // 12x4
-void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
 void kernel_dgemm_dtrsm_nt_rl_inv_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
-void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
+void kernel_dgemm_dtrsm_nt_rl_inv_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
 void kernel_dsyrk_dpotrf_nt_l_12x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_12x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
 // 4x12
 void kernel_dgemm_dtrsm_nt_rl_inv_4x12_vs_lib4(int kp, double *Ap, double *Bp, int sdbp, int km_, double *Am, double *Bm, int sdbm, double *C, double *D, double *E, int sde, double *inv_diag_E, int km, int kn);
 // 8x8
@@ -425,8 +577,8 @@ void kernel_dsyrk_dpotrf_nt_l_8x8_vs_lib4(int kp, double *Ap, int sdap, double *
 void kernel_dgemm_dtrsm_nt_rl_inv_8x8l_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdb, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
 void kernel_dgemm_dtrsm_nt_rl_inv_8x8u_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int sdb, int km_, double *Am, int sdam, double *Bm, int sdbm, double *C, int sdc, double *D, int sdd, double *E, int sde, double *inv_diag_E, int km, int kn);
 // 8x4
-void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
 void kernel_dgemm_dtrsm_nt_rl_inv_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E);
+void kernel_dgemm_dtrsm_nt_rl_inv_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *E, double *inv_diag_E, int km, int kn);
 void kernel_dsyrk_dpotrf_nt_l_8x4_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D);
 void kernel_dsyrk_dpotrf_nt_l_8x4_vs_lib4(int kp, double *Ap, int sdap, double *Bp, int km_, double *Am, int sdam, double *Bm, double *C, int sdc, double *D, int sdd, double *inv_diag_D, int km, int kn);
 // 4x8
@@ -434,16 +586,16 @@ void kernel_dgemm_dtrsm_nt_rl_inv_4x8_vs_lib4(int kp, double *Ap, double *Bp, in
 // 4x4
 void kernel_dgemm_dtrsm_nt_rl_inv_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
 void kernel_dgemm_dtrsm_nt_rl_inv_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
-void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
 void kernel_dsyrk_dpotrf_nt_l_4x4_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_4x4_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
 // 4x2
 void kernel_dgemm_dtrsm_nt_rl_inv_4x2_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E);
 void kernel_dgemm_dtrsm_nt_rl_inv_4x2_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *E, double *inv_diag_E, int km, int kn);
-void kernel_dsyrk_dpotrf_nt_l_4x2_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
 void kernel_dsyrk_dpotrf_nt_l_4x2_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_4x2_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
 // 2x2
-void kernel_dsyrk_dpotrf_nt_l_2x2_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
 void kernel_dsyrk_dpotrf_nt_l_2x2_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D);
+void kernel_dsyrk_dpotrf_nt_l_2x2_vs_lib4(int kp, double *Ap, double *Bp, int km_, double *Am, double *Bm, double *C, double *D, double *inv_diag_D, int km, int kn);
 
 /*
  *
@@ -1034,6 +1186,53 @@ void kernel_dgemm_nt_8xn_p0_lib44cc(int n, int k, double *alpha, double *A, int
 
 
 
+// A, B panel-major bs=8; C, D column-major
+// 24x8
+void kernel_dgemm_nt_24x8_lib88cc(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_24x8_vs_lib88cc(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+// 16x8
+void kernel_dgemm_nt_16x8_lib88cc(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_16x8_vs_lib88cc(int kmax, double *alpha, double *A, int sda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+// 8x8
+void kernel_dgemm_nt_8x8_lib88cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_8x8_vs_lib88cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+
+// A, panel-major bs=8; B, C, D column-major
+// 24x8
+void kernel_dgemm_nt_24x8_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_24x8_vs_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+void kernel_dgemm_nn_24x8_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nn_24x8_vs_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+// 16x8
+void kernel_dgemm_nt_16x8_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_16x8_vs_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+void kernel_dgemm_nn_16x8_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nn_16x8_vs_lib8ccc(int kmax, double *alpha, double *A, int sda, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+// 8x8
+void kernel_dgemm_nt_8x8_lib8ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_8x8_vs_lib8ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+void kernel_dgemm_nn_8x8_lib8ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nn_8x8_vs_lib8ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+
+// B, panel-major bs=8; A, C, D column-major
+// 8x24
+void kernel_dgemm_nt_8x24_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_8x24_vs_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+void kernel_dgemm_tt_8x24_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_tt_8x24_vs_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+// 8x16
+void kernel_dgemm_nt_8x16_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_8x16_vs_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+void kernel_dgemm_tt_8x16_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_tt_8x16_vs_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, int sdb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+// 8x8
+void kernel_dgemm_nt_8x8_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_nt_8x8_vs_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+void kernel_dgemm_tt_8x8_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd);
+void kernel_dgemm_tt_8x8_vs_libc8cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);
+
+
+
 // aux
 void kernel_dvecld_inc1(int kmax, double *x);
 void kernel_dveccp_inc1(int kmax, double *x, double *y);
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_api.h b/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_api.h
index 72c19dcdc..24b21ead2 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_api.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_api.h
@@ -97,14 +97,18 @@ void blasfeo_strsv_ltu(int m, struct blasfeo_smat *sA, int ai, int aj, struct bl
 void blasfeo_strsv_unn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
 void blasfeo_strsv_utn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A * x ; A lower triangular
+void blasfeo_strmv_lnn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A * x ; A lower triangular, unit diagonal
+void blasfeo_strmv_lnu(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void blasfeo_strmv_ltn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A' * x ; A lower triangular, unit diagonal
+void blasfeo_strmv_ltu(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z <= beta * y + alpha * A * x ; A upper triangular
 void blasfeo_strmv_unn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z <= A' * x ; A upper triangular
 void blasfeo_strmv_utn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
-// z <= A * x ; A lower triangular
-void blasfeo_strmv_lnn(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
-// z <= A' * x ; A lower triangular
-void blasfeo_strmv_ltn(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z_n <= beta_n * y_n + alpha_n * A  * x_n
 // z_t <= beta_t * y_t + alpha_t * A' * x_t
 void blasfeo_sgemv_nt(int m, int n, float alpha_n, float alpha_t, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx_n, int xi_n, struct blasfeo_svec *sx_t, int xi_t, float beta_n, float beta_t, struct blasfeo_svec *sy_n, int yi_n, struct blasfeo_svec *sy_t, int yi_t, struct blasfeo_svec *sz_n, int zi_n, struct blasfeo_svec *sz_t, int zi_t);
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_ref_api.h b/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_ref_api.h
index db8b28d3b..805914cc0 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_ref_api.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_s_blasfeo_ref_api.h
@@ -97,19 +97,24 @@ void blasfeo_ref_strsv_ltu(int m, struct blasfeo_smat *sA, int ai, int aj, struc
 void blasfeo_ref_strsv_unn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z <= inv( A' ) * x, A (m)x(m) upper, transposed, not_unit
 void blasfeo_ref_strsv_utn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A * x ; A lower triangular
+void blasfeo_ref_strmv_lnn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A * x ; A lower triangular, unit diagonal
+void blasfeo_ref_strmv_lnu(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A' * x ; A lower triangular
+void blasfeo_ref_strmv_ltn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
+// z <= A' * x ; A lower triangular, unit diagonal
+void blasfeo_ref_strmv_ltu(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z <= beta * y + alpha * A * x ; A upper triangular
 void blasfeo_ref_strmv_unn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z <= A' * x ; A upper triangular
 void blasfeo_ref_strmv_utn(int m, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
-// z <= A * x ; A lower triangular
-void blasfeo_ref_strmv_lnn(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
-// z <= A' * x ; A lower triangular
-void blasfeo_ref_strmv_ltn(int m, int n, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, struct blasfeo_svec *sz, int zi);
 // z_n <= beta_n * y_n + alpha_n * A  * x_n
 // z_t <= beta_t * y_t + alpha_t * A' * x_t
 void blasfeo_ref_sgemv_nt(int m, int n, float alpha_n, float alpha_t, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx_n, int xi_n, struct blasfeo_svec *sx_t, int xi_t, float beta_n, float beta_t, struct blasfeo_svec *sy_n, int yi_n, struct blasfeo_svec *sy_t, int yi_t, struct blasfeo_svec *sz_n, int zi_n, struct blasfeo_svec *sz_t, int zi_t);
 // z <= beta * y + alpha * A * x, where A is symmetric and only the lower triangular patr of A is accessed
-void blasfeo_ref_ssymv_l(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, float beta, struct blasfeo_svec *sy, int yi, struct blasfeo_svec *sz, int zi);
+void blasfeo_ref_ssymv_l(int m, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, float beta, struct blasfeo_svec *sy, int yi, struct blasfeo_svec *sz, int zi);
+void blasfeo_ref_ssymv_l_mn(int m, int n, float alpha, struct blasfeo_smat *sA, int ai, int aj, struct blasfeo_svec *sx, int xi, float beta, struct blasfeo_svec *sy, int yi, struct blasfeo_svec *sz, int zi);
 
 // diagonal
 
diff --git a/third_party/acados/include/blasfeo/include/blasfeo_s_kernel.h b/third_party/acados/include/blasfeo/include/blasfeo_s_kernel.h
index dcbde989c..99d2b28c8 100644
--- a/third_party/acados/include/blasfeo/include/blasfeo_s_kernel.h
+++ b/third_party/acados/include/blasfeo/include/blasfeo_s_kernel.h
@@ -550,6 +550,16 @@ void kernel_spotrf_nt_l_4x4_lib44cc(int kmax, float *A, float *B, float *C, int
 void kernel_spotrf_nt_l_4x4_vs_lib44cc(int kmax, float *A, float *B, float *C, int ldc, float *D, int ldd, float *dD, int m1, int n1);
 
 // B panel-major bs=8; A, C, D column-major
+// 4x24
+void kernel_sgemm_nt_4x24_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd);
+void kernel_sgemm_nt_4x24_vs_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1);
+void kernel_sgemm_tt_4x24_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd);
+void kernel_sgemm_tt_4x24_vs_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1);
+// 4x16
+void kernel_sgemm_nt_4x16_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd);
+void kernel_sgemm_nt_4x16_vs_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1);
+void kernel_sgemm_tt_4x16_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd);
+void kernel_sgemm_tt_4x16_vs_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, int sdb, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1);
 // 8x8
 void kernel_sgemm_nt_8x8_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, float *beta, float *C, int ldc, float *D, int ldd);
 void kernel_sgemm_nt_8x8_vs_libc8cc(int kmax, float *alpha, float *A, int lda, float *B, float *beta, float *C, int ldc, float *D, int ldd, int m1, int n1);
diff --git a/third_party/acados/larch64/lib/libacados.so b/third_party/acados/larch64/lib/libacados.so
index 1716e9ad6..1f2b1f5c0 100644
Binary files a/third_party/acados/larch64/lib/libacados.so and b/third_party/acados/larch64/lib/libacados.so differ
diff --git a/third_party/acados/larch64/lib/libblasfeo.so b/third_party/acados/larch64/lib/libblasfeo.so
index a48afdd8c..790282fa1 100644
Binary files a/third_party/acados/larch64/lib/libblasfeo.so and b/third_party/acados/larch64/lib/libblasfeo.so differ
diff --git a/third_party/acados/larch64/lib/libhpipm.so b/third_party/acados/larch64/lib/libhpipm.so
index c9725f868..bebd5e891 100644
Binary files a/third_party/acados/larch64/lib/libhpipm.so and b/third_party/acados/larch64/lib/libhpipm.so differ
diff --git a/third_party/acados/x86_64/lib/libacados.so b/third_party/acados/x86_64/lib/libacados.so
index 5a1475426..00a5e2371 100644
Binary files a/third_party/acados/x86_64/lib/libacados.so and b/third_party/acados/x86_64/lib/libacados.so differ
diff --git a/third_party/acados/x86_64/lib/libblasfeo.so b/third_party/acados/x86_64/lib/libblasfeo.so
index 5eb0e988c..5a8399287 100644
Binary files a/third_party/acados/x86_64/lib/libblasfeo.so and b/third_party/acados/x86_64/lib/libblasfeo.so differ
diff --git a/third_party/acados/x86_64/lib/libhpipm.so b/third_party/acados/x86_64/lib/libhpipm.so
index a647b056f..f52091d93 100644
Binary files a/third_party/acados/x86_64/lib/libhpipm.so and b/third_party/acados/x86_64/lib/libhpipm.so differ