> compile NEMU

ysyx_22040000 李心杨 Linux calcite 6.6.19 #1-NixOS SMP PREEMPT_DYNAMIC Fri Mar 1 12:35:11 UTC 2024 x86_64 GNU/Linux 16:45:08 up 4 days 3:51, 2 users, load average: 1.13, 0.84, 0.84
> compile NEMU
2024-03-24 16:45:08 +08:00 · 2024-03-24 16:45:06 +08:00 · 2024-03-24 16:44:50 +08:00 · 2024-03-24 16:44:49 +08:00 · 2024-03-24 16:28:09 +08:00 · 2024-03-24 16:27:18 +08:00
440 changed files with 45918 additions and 407 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,3 +10,5 @@
 !init.sh
 /fceux-am
 /nvboard
+/am-kernels
+**/.cache
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "am-kernels"]
+	path = am-kernels
+	url = ./am-kernels/
--- a/abstract-machine/.gitignore
+++ b/abstract-machine/.gitignore
@ -1,19 +1,6 @@
-*
-!*/
-!*.h
-!*.c
-!*.cc
-!*.S
-!*.ld
-!*.sh
-!*.py
-!*.mk
-!Makefile
-!README
-!LICENSE
-.*
-_*
-*~
-build/
-!.gitignore
-.vscode
+**/.direnv/
+**/build/
+**/.envrc
+**/.cache
+.vscode
+compile_commands.json
--- a/abstract-machine/CMakeLists.txt
+++ b/abstract-machine/CMakeLists.txt
@ -0,0 +1,87 @@
+cmake_minimum_required(VERSION 3.22)
+
+# ASM is listed explicitly because the am/ subdirectories add *.S sources.
+project(abstract-machine LANGUAGES C CXX ASM)
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 11)
+
+include(CMakeDependentOption)       # cmake_dependent_option()
+include(CMakePackageConfigHelpers)  # configure_package_config_file()
+# Defines CMAKE_INSTALL_LIBDIR, which the install() rules in am/src and
+# klib/src use as the prefix of their package-config destinations. Without it
+# the variable is empty and those destinations start with "/cmake/...".
+include(GNUInstallDirs)
+include(CTest)
+
+# -- General options
+# ISA selects the instruction set the libraries are built for. It has to be
+# provided by a preset or with -DISA=<value>.
+set(ISA "" CACHE STRING "Target ISA")
+set_property(CACHE ISA PROPERTY STRINGS "riscv" "x86" "x86_64" "native")
+if("${ISA}" STREQUAL "")
+    message(FATAL_ERROR
+        "ISA is not set; pass -DISA=<riscv|x86|x86_64|native> or use a preset")
+endif()
+# Quoted so the command always receives exactly two arguments.
+string(TOUPPER "${ISA}" ISA_UPPER)
+
+# The regex must not contain blanks: "(riscv | x86)" only matched the literal
+# strings "riscv " and " x86", so the NEMU platform was never made available.
+cmake_dependent_option(
+    __PLATFORM_NEMU__ "Run on NEMU"
+    ON "ISA MATCHES \"riscv|x86\"" OFF)
+cmake_dependent_option(
+    __PLATFORM_NATIVE__ "Run on native"
+    ON "ISA MATCHES native" OFF)
+
+# -- Set PLATFORM according to options
+# Scan the cache for __PLATFORM_<NAME>__ switches and derive PLATFORM and
+# PLATFORM_UPPER from the one that is switched on.
+set(MATCH_PLATFORM_PATTERN "^__PLATFORM_([A-Z]*)__")
+get_cmake_property(CACHE_VARS CACHE_VARIABLES)
+
+message(STATUS "ISA: ${ISA}")
+foreach(VAR IN LISTS CACHE_VARS)
+    if(VAR MATCHES "${MATCH_PLATFORM_PATTERN}")
+        # Save the capture before any other command can reset CMAKE_MATCH_<n>.
+        set(MATCHED_PLATFORM "${CMAKE_MATCH_1}")
+        # Retrieve the value of the cache variable
+        get_property(VAR_VALUE CACHE ${VAR} PROPERTY VALUE)
+        # Only an enabled switch selects the platform. Previously every
+        # matching cache entry did, so a disabled one could win the race.
+        if(VAR_VALUE)
+            set(PLATFORM_UPPER "${MATCHED_PLATFORM}")
+            string(TOLOWER "${PLATFORM_UPPER}" PLATFORM)
+            message(STATUS "Variable: ${VAR}=${VAR_VALUE}, Platform: ${PLATFORM}")
+        endif()
+    endif()
+endforeach()
+
+if(NOT DEFINED PLATFORM)
+    message(FATAL_ERROR "No __PLATFORM_*__ option is enabled for ISA '${ISA}'")
+endif()
+
+# ARCH is "native" on the host platform and "<isa>-<platform>" otherwise.
+if("${PLATFORM}" MATCHES "native")
+    set(ARCH "native")
+else()
+    set(ARCH "${ISA}-${PLATFORM}")
+endif()
+string(TOUPPER "${ARCH}" ARCH_UPPER)
+
+# -- Target specific options
+# NOTE(review): __ISA_NATIVE__ only appears as a compile definition below; no
+# CMake variable of that name is set in this file, so "NOT __ISA_NATIVE__"
+# looks like it is always true here -- confirm the intended condition.
+cmake_dependent_option(
+    NATIVE_USE_KLIB "Use Klib even if on native"
+    ON "NOT __ISA_NATIVE__" OFF)
+
+# -- Add compile definitions based on options
+# Emits __ARCH_<ARCH>__ (sanitised into a valid C identifier), __ISA_<ISA>__
+# and __PLATFORM_<PLATFORM>__ for every target below this directory.
+add_compile_definitions(
+    $<MAKE_C_IDENTIFIER:__ARCH_${ARCH_UPPER}__>
+    __ISA_${ISA_UPPER}__
+    __PLATFORM_${PLATFORM_UPPER}__
+)
+
+# __NATIVE_USE_KLIB__ is defined only when NATIVE_USE_KLIB evaluates to true.
+add_compile_definitions(
+    $<$<BOOL:${NATIVE_USE_KLIB}>:__NATIVE_USE_KLIB__>
+)
+
+# -- Required compiler flags
+# Directory-scope options: they apply to every target defined in this
+# directory and in the subdirectories added further down.
+add_compile_options(
+    # -Werror
+    -Wno-main
+    -fno-asynchronous-unwind-tables
+    -fno-builtin
+    -fno-stack-protector
+    -U_FORTIFY_SOURCE
+    # The remaining three flags are passed only when compiling C++ sources.
+    $<$<COMPILE_LANGUAGE:CXX>:-fno-exceptions>
+    $<$<COMPILE_LANGUAGE:CXX>:-ffreestanding>
+    $<$<COMPILE_LANGUAGE:CXX>:-fno-rtti>)
+
+# Directory-scope link option, added to every link step below this directory.
+add_link_options(
+    -znoexecstack
+)
+
+# -- Linker script name. am/src passes scripts/${LINKER_SCRIPT} to the linker
+# through the interface link options of the am-${ARCH} target.
+set(LINKER_SCRIPT linker.ld)
+
+# Makes include(nemu-settings) / include(riscv-settings) resolvable.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+
+# The RV32 machine flags are only meaningful for a RISC-V toolchain; handing
+# them to the host compiler breaks the native and x86 configurations.
+if(ISA STREQUAL "riscv")
+    add_compile_options(-march=rv32if -mabi=ilp32)
+    add_link_options(-march=rv32if -mabi=ilp32)
+endif()
+
+add_subdirectory(klib)
+add_subdirectory(am)
--- a/abstract-machine/CMakePresets.json
+++ b/abstract-machine/CMakePresets.json
@ -0,0 +1,29 @@
+{
+    "version": 6,
+    "configurePresets": [
+        {
+            "name": "native",
+            "displayName": "Native",
+            "generator": "Unix Makefiles",
+            "binaryDir": "${sourceDir}/out/build/${presetName}",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Debug",
+                "ISA": "native",
+                "__PLATFORM_NATIVE__": true,
+                "NATIVE_USE_KLIB": true
+            }
+        },
+        {
+            "name": "riscv-nemu",
+            "displayName": "Riscv32 NEMU",
+            "generator": "Unix Makefiles",
+            "binaryDir": "${sourceDir}/out/build/${presetName}",
+            "installDir": "${sourceDir}/out/install",
+            "cacheVariables": {
+                "CMAKE_BUILD_TYPE": "Debug",
+                "ISA": "riscv",
+                "__PLATFORM_NEMU__": true
+            }
+        }
+    ]
+}
--- a/abstract-machine/Makefile
+++ b/abstract-machine/Makefile
@ -47,33 +47,32 @@ endif

 ### Create the destination directory (`build/$ARCH`)
 WORK_DIR  = $(shell pwd)
-DST_DIR   = $(WORK_DIR)/build/$(ARCH)
+BUILD_DIR ?= $(WORK_DIR)/build
+DST_DIR   = $(BUILD_DIR)/$(ARCH)
 $(shell mkdir -p $(DST_DIR))

 ### Compilation targets (a binary image or archive)
-IMAGE_REL = build/$(NAME)-$(ARCH)
+IMAGE_REL = $(DST_DIR)/$(NAME)-$(ARCH)
 IMAGE     = $(abspath $(IMAGE_REL))
-ARCHIVE   = $(WORK_DIR)/build/$(NAME)-$(ARCH).a
+ARCHIVE   = $(BUILD_DIR)/$(NAME)-$(ARCH).a

 ### Collect the files to be linked: object files (`.o`) and libraries (`.a`)
 OBJS      = $(addprefix $(DST_DIR)/, $(addsuffix .o, $(basename $(SRCS))))
 LIBS     := $(sort $(LIBS) am klib) # lazy evaluation ("=") causes infinite recursions
 LINKAGE   = $(OBJS) \
-  $(addsuffix -$(ARCH).a, $(join \
-    $(addsuffix /build/, $(addprefix $(AM_HOME)/, $(LIBS))), \
-    $(LIBS) ))
+  $(addsuffix -$(ARCH).a, $(addprefix $(BUILD_DIR)/, $(LIBS)))

 ## 3. General Compilation Flags

 ### (Cross) compilers, e.g., mips-linux-gnu-g++
-AS        = $(CROSS_COMPILE)gcc
-CC        = $(CROSS_COMPILE)gcc
-CXX       = $(CROSS_COMPILE)g++
-LD        = $(CROSS_COMPILE)ld
-AR        = $(CROSS_COMPILE)ar
-OBJDUMP   = $(CROSS_COMPILE)objdump
-OBJCOPY   = $(CROSS_COMPILE)objcopy
-READELF   = $(CROSS_COMPILE)readelf
+# GNU make predefines AS, CC, CXX, LD and AR, so a plain `?=` never applies the
+# $(CROSS_COMPILE) prefix to them. Replace only make's built-in default (or an
+# undefined value when built-in variables are disabled); anything that comes
+# from the environment, the command line or another makefile is kept.
+ifneq ($(filter default undefined,$(origin AS)),)
+AS        = $(CROSS_COMPILE)gcc
+endif
+ifneq ($(filter default undefined,$(origin CC)),)
+CC        = $(CROSS_COMPILE)gcc
+endif
+ifneq ($(filter default undefined,$(origin CXX)),)
+CXX       = $(CROSS_COMPILE)g++
+endif
+ifneq ($(filter default undefined,$(origin LD)),)
+LD        = $(CROSS_COMPILE)ld
+endif
+ifneq ($(filter default undefined,$(origin AR)),)
+AR        = $(CROSS_COMPILE)ar
+endif
+# These have no built-in default, so `?=` behaves as intended.
+OBJDUMP   ?= $(CROSS_COMPILE)objdump
+OBJCOPY   ?= $(CROSS_COMPILE)objcopy
+READELF   ?= $(CROSS_COMPILE)readelf

 ### Compilation flags
 INC_PATH += $(WORK_DIR)/include $(addsuffix /include/, $(addprefix $(AM_HOME)/, $(LIBS)))
--- a/abstract-machine/am/CMakeLists.txt
+++ b/abstract-machine/am/CMakeLists.txt
@ -0,0 +1,10 @@
+# Directory-scope include path: visible to every target created in this
+# directory and in src/.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+# Header-only target that carries the AM include directory, both for the build
+# tree and for the installed package.
+add_library(am_interface INTERFACE)
+target_include_directories(am_interface INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    $<INSTALL_INTERFACE:include/abstract-machine>)
+
+add_subdirectory(src)
+
+# Installs the contents of include/ (trailing slash: not the directory itself).
+install(DIRECTORY include/ DESTINATION include/abstract-machine)
--- a/abstract-machine/am/src/CMakeLists.txt
+++ b/abstract-machine/am/src/CMakeLists.txt
@ -0,0 +1,53 @@
+# Pick the platform source directory: ./<platform> for the native ISA and
+# ./<isa>/<platform> otherwise. That directory defines the am-${ARCH} target
+# configured below.
+if(ISA MATCHES "native")
+set(SOURCEDIR "./${PLATFORM}")
+else()
+set(SOURCEDIR "./${ISA}/${PLATFORM}")
+endif()
+
+add_subdirectory(${SOURCEDIR})
+
+# This directory is a private include path; the public AM headers are exposed
+# to consumers for both the build tree and the installed package.
+target_include_directories(am-${ARCH}
+    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}
+    PUBLIC
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+      $<INSTALL_INTERFACE:include/abstract-machine>)
+target_link_libraries(am-${ARCH}
+    PUBLIC klib_interface
+    INTERFACE m)
+
+# TODO: Check
+# Consumers link with the linker script named by LINKER_SCRIPT.
+# NOTE(review): the INSTALL_INTERFACE path is relative and nothing visible here
+# prefixes it with the install prefix -- confirm it resolves for consumers.
+target_link_options(am-${ARCH} INTERFACE
+    $<BUILD_INTERFACE:-T${CMAKE_SOURCE_DIR}/scripts/${LINKER_SCRIPT}>
+    $<INSTALL_INTERFACE:-T${CMAKE_INSTALL_LIBDIR}/cmake/am-${ARCH}/${LINKER_SCRIPT}>)
+
+# Interface compile flags
+# These repeat the directory-scope flags of the top-level CMakeLists.txt so
+# that consumers of the target inherit them as well.
+target_link_options(am-${ARCH} INTERFACE
+        -znoexecstack)
+
+target_compile_options(am-${ARCH} INTERFACE
+        -fno-asynchronous-unwind-tables
+        -fno-builtin
+        -fno-stack-protector
+        -U_FORTIFY_SOURCE
+        $<$<COMPILE_LANGUAGE:CXX>:-fno-exceptions>
+        $<$<COMPILE_LANGUAGE:CXX>:-ffreestanding>
+        $<$<COMPILE_LANGUAGE:CXX>:-fno-rtti>)
+
+# -- Installation: library plus interface targets, exported as amTargets
+install(TARGETS am-${ARCH} klib_interface am_interface
+        EXPORT amTargets
+        LIBRARY DESTINATION lib)
+
+install(EXPORT amTargets 
+        FILE amTargets.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/am-${ARCH})
+
+# Generates am-${ARCH}-config.cmake from the template in cmake/.
+configure_package_config_file(${CMAKE_SOURCE_DIR}/cmake/am-config.cmake.in
+  ${CMAKE_CURRENT_BINARY_DIR}/am-${ARCH}-config.cmake
+  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/am-${ARCH})
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/am-${ARCH}-config.cmake
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/am-${ARCH})
+
+# TODO: check
+# The linker script is installed next to the package config file.
+install(FILES ${CMAKE_SOURCE_DIR}/scripts/${LINKER_SCRIPT}
+        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/am-${ARCH})
--- a/abstract-machine/am/src/native/CMakeLists.txt
+++ b/abstract-machine/am/src/native/CMakeLists.txt
@ -0,0 +1,26 @@
+include(CheckPIESupported)
+check_pie_supported()
+
+# Sources of the native AM library, listed directly on the target.
+add_library(am-native
+    trap.S
+    cte.c
+    ioe.c
+    mpe.c
+    platform.c
+    trm.c
+    vme.c
+    ioe/audio.c
+    ioe/disk.c
+    ioe/gpu.c
+    ioe/input.c
+    ioe/timer.c
+)
+
+# FIXME: get free(): invalid address when user program compiled without pie
+# Build the library as PIC and request PIC from its consumers too.
+set_target_properties(am-native
+    PROPERTIES
+        POSITION_INDEPENDENT_CODE TRUE
+        INTERFACE_POSITION_INDEPENDENT_CODE TRUE
+)
+
+# SDL2 is a public dependency, so it propagates to everything linking the
+# library.
+find_package(SDL2 REQUIRED)
+target_link_libraries(am-${ARCH}
+    PUBLIC SDL2::SDL2
+)
--- a/abstract-machine/am/src/riscv/nemu/CMakeLists.txt
+++ b/abstract-machine/am/src/riscv/nemu/CMakeLists.txt
@ -0,0 +1,34 @@
+include(nemu-settings)
+include(riscv-settings)
+
+# AM library for NEMU: the sources listed here plus the platform sources that
+# nemu-settings.cmake collects in NEMU_SOURCES.
+add_library(am-${ISA}-nemu
+    cte.c
+    start.S
+    trap.S
+    vme.c
+    ${NEMU_SOURCES}
+)
+
+target_compile_options(am-${ISA}-nemu PRIVATE
+    ${NEMU_COMPILE_OPTIONS}
+    ${RISCV_COMPILE_OPTIONS})
+# The variable was misspelled (NEMU_LINK_OPITIONS) and silently expanded to
+# nothing. CMake does not use link options when archiving a static library;
+# consumers get their link options from the INTERFACE block below.
+target_link_options(am-${ISA}-nemu PRIVATE
+    ${NEMU_LINK_OPTIONS}
+    ${RISCV_LINK_OPTIONS})
+target_include_directories(am-${ISA}-nemu PRIVATE
+    ${NEMU_INCLUDE_DIRECTORIES})
+# Options every program linking against this library must pass to the linker.
+# An option and its value are joined with a comma after the LINKER: prefix;
+# "LINKER:-e _start" would be split into two separate arguments.
+target_link_options(am-${ISA}-nemu INTERFACE
+    LINKER:--defsym=_pmem_start=0x80000000
+    LINKER:--defsym=_entry_offset=0x0
+    LINKER:--gc-sections
+    LINKER:-e,_start
+    -nostartfiles)
+
+# ARCH_H is visible to consumers as well; ISA_H only while building the library.
+target_compile_definitions(am-${ISA}-nemu PUBLIC
+    ARCH_H="arch/riscv.h")
+target_compile_definitions(am-${ISA}-nemu PRIVATE
+    ISA_H="riscv/riscv.h")
+
+# Neither the library nor its consumers are built as position-independent code.
+set_target_properties(am-${ISA}-nemu PROPERTIES
+    POSITION_INDEPENDENT_CODE OFF
+    INTERFACE_POSITION_INDEPENDENT_CODE OFF)
--- a/abstract-machine/cmake/am-config.cmake.in
+++ b/abstract-machine/cmake/am-config.cmake.in
@ -0,0 +1,9 @@
+@PACKAGE_INIT@
+
+include(CMakeFindDependencyMacro)
+# configure_package_config_file() substitutes @VAR@ references only, so the
+# architecture this package was built for has to be written as @ARCH@. A
+# ${ARCH} reference would be evaluated in the consuming project, where ARCH is
+# normally undefined and the unquoted if() is malformed.
+if("@ARCH@" MATCHES "native")
+    find_dependency(SDL2 REQUIRED)
+endif()
+
+# Include the targets file
+include("${CMAKE_CURRENT_LIST_DIR}/amTargets.cmake")
--- a/abstract-machine/cmake/klib-config.cmake.in
+++ b/abstract-machine/cmake/klib-config.cmake.in
@ -0,0 +1,6 @@
+# Package configuration template for klib; @PACKAGE_INIT@ is replaced by
+# configure_package_config_file() when the file is generated.
+@PACKAGE_INIT@
+
+# Loaded for find_dependency(); no dependency is declared in this template yet.
+include(CMakeFindDependencyMacro)
+
+# Include the targets file
+include("${CMAKE_CURRENT_LIST_DIR}/klibTargets.cmake")
--- a/abstract-machine/cmake/nemu-settings.cmake
+++ b/abstract-machine/cmake/nemu-settings.cmake
@ -0,0 +1,11 @@
+# Settings shared by every AM target that runs on NEMU. Included from the
+# platform CMakeLists.txt, which consumes the NEMU_* variables defined here.
+set(NEMU_COMPILE_OPTIONS -fdata-sections -ffunction-sections)
+set(NEMU_LINK_OPTIONS
+    --defsym=_pmem_start=0x80000000
+    --defsym=_entry_offset=0x0
+    --gc-sections
+    -e _start)
+set(NEMU_INCLUDE_DIRECTORIES
+    ${CMAKE_SOURCE_DIR}/am/src/platform/nemu/include)
+# CONFIGURE_DEPENDS re-runs the glob at build time, so platform sources that
+# are added or removed are picked up without a manual re-configure.
+file(GLOB_RECURSE NEMU_SOURCES CONFIGURE_DEPENDS
+    ${CMAKE_SOURCE_DIR}/am/src/platform/nemu/*.[cS])
+set(INCLUDE_LINKER_SCRIPT ON)
--- a/abstract-machine/cmake/riscv-settings.cmake
+++ b/abstract-machine/cmake/riscv-settings.cmake
@ -0,0 +1,2 @@
+# RISC-V specific compile and link options; both lists are currently empty.
+set(RISCV_COMPILE_OPTIONS)
+set(RISCV_LINK_OPTIONS)
--- a/abstract-machine/klib/CMakeLists.txt
+++ b/abstract-machine/klib/CMakeLists.txt
@ -0,0 +1,12 @@
+# Directory-scope include path: visible to every target created in this
+# directory and in src/.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+# Header-only target that carries the klib include directory, both for the
+# build tree and for the installed package.
+add_library(klib_interface INTERFACE)
+target_include_directories(klib_interface
+    INTERFACE
+      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+      $<INSTALL_INTERFACE:include/abstract-machine>)
+
+add_subdirectory(src)
+# The tests directory is currently not part of the build.
+# add_subdirectory(tests)
+
+# Installs the contents of include/ (trailing slash: not the directory itself).
+install(DIRECTORY include/ DESTINATION include/abstract-machine)
--- a/abstract-machine/klib/include/klib.h
+++ b/abstract-machine/klib/include/klib.h
@ -35,6 +35,7 @@ int    atoi      (const char *nptr);
 int    printf    (const char *format, ...);
 int    sprintf   (char *str, const char *format, ...);
 int    snprintf  (char *str, size_t size, const char *format, ...);
+int    vprintf    (const char *format, va_list ap);
 int    vsprintf  (char *str, const char *format, va_list ap);
 int    vsnprintf (char *str, size_t size, const char *format, va_list ap);

--- a/abstract-machine/klib/result
+++ b/abstract-machine/klib/result
@ -0,0 +1 @@
+/nix/store/h1glxbcjgw3mv218w2wy73yih6s5p7iz-gdb-13.2
--- a/abstract-machine/klib/src/CMakeLists.txt
+++ b/abstract-machine/klib/src/CMakeLists.txt
@ -0,0 +1,33 @@
+# klib library target; the sources are listed directly on the target.
+add_library(klib
+    cpp.c
+    int64.c
+    stdio.c
+    stdlib.c
+    string.c
+)
+
+# Reuse the AM include directories and the compile definitions exported by
+# the selected am-${ARCH} target.
+target_include_directories(klib
+    PUBLIC
+        $<TARGET_PROPERTY:am_interface,INTERFACE_INCLUDE_DIRECTORIES>
+)
+target_compile_definitions(klib
+    PUBLIC
+        $<TARGET_PROPERTY:am-${ARCH},INTERFACE_COMPILE_DEFINITIONS>
+)
+
+# -- Installation: library, exported targets and package config file
+install(
+    TARGETS klib
+    EXPORT klibTargets
+    LIBRARY DESTINATION lib
+)
+
+install(
+    EXPORT klibTargets
+    FILE klibTargets.cmake
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/klib
+)
+
+configure_package_config_file(
+    ${CMAKE_SOURCE_DIR}/cmake/klib-config.cmake.in
+    ${CMAKE_CURRENT_BINARY_DIR}/klib-config.cmake
+    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/klib
+)
+
+install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/klib-config.cmake
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/klib
+)
+
--- a/abstract-machine/klib/src/stdio.c
+++ b/abstract-machine/klib/src/stdio.c
@ -5,8 +5,20 @@

 #if !defined(__ISA_NATIVE__) || defined(__NATIVE_USE_KLIB__)

+/* Writes fmt to the output one character at a time through putch().
+ * Conversion specifiers are not interpreted yet: the format string is echoed
+ * verbatim and ap is unused. Returns the number of characters written. */
+int vprintf(const char *fmt, va_list ap) {
+  (void)ap;
+  int written = 0;
+  // Advance the cursor on every iteration; the previous loop never moved it
+  // and therefore spun forever on any non-empty string.
+  for (const char *p = fmt; *p != '\0'; p++) {
+    putch(*p);
+    written++;
+  }
+  return written;
+}
+
 int printf(const char *fmt, ...) {
-  panic("Not implemented");
+  // Forward the variadic arguments to vprintf() and report its result, which
+  // is what callers of printf() expect as the return value.
+  va_list args;
+  va_start(args, fmt);
+  int written = vprintf(fmt, args);
+  va_end(args);
+  return written;
 }

 int vsprintf(char *out, const char *fmt, va_list ap) {
--- a/abstract-machine/klib/src/string.c
+++ b/abstract-machine/klib/src/string.c
@ -5,43 +5,115 @@
 #if !defined(__ISA_NATIVE__) || defined(__NATIVE_USE_KLIB__)

 size_t strlen(const char *s) {
-  panic("Not implemented");
+  // Count the bytes that precede the terminating NUL.
+  size_t count = 0;
+  while (s[count] != '\0') {
+    count++;
+  }
+  return count;
 }

 char *strcpy(char *dst, const char *src) {
-  panic("Not implemented");
+  // Copy every byte up to the terminator, then write the terminator itself.
+  size_t i = 0;
+  while (src[i] != '\0') {
+    dst[i] = src[i];
+    i++;
+  }
+  dst[i] = '\0';
+  return dst;
 }

 char *strncpy(char *dst, const char *src, size_t n) {
-  panic("Not implemented");
+  // Copies at most n bytes of src; when src is shorter than n the remainder
+  // of dst is zero-filled. dst is NOT terminated if src has n or more bytes.
+  // The index is a size_t so it is compared with n without sign conversion.
+  size_t i = 0;
+  for (; i < n && src[i] != '\0'; i++) {
+    dst[i] = src[i];
+  }
+  for (; i < n; i++) {
+    dst[i] = '\0';
+  }
+  return dst;
 }

 char *strcat(char *dst, const char *src) {
-  panic("Not implemented");
+  // Find the end of dst, then append src including its terminator.
+  size_t end = 0;
+  while (dst[end] != '\0') {
+    end++;
+  }
+  size_t i = 0;
+  while (src[i] != '\0') {
+    dst[end + i] = src[i];
+    i++;
+  }
+  dst[end + i] = '\0';
+  return dst;
 }

 int strcmp(const char *s1, const char *s2) {
-  panic("Not implemented");
+  // Compare as unsigned char, as the C standard requires; with plain char the
+  // sign of the result for bytes >= 0x80 depends on the platform.
+  const unsigned char *a = (const unsigned char *)s1;
+  const unsigned char *b = (const unsigned char *)s2;
+  // Stop at the first difference or at the end of s1 (which, being equal so
+  // far, is also the end of s2).
+  while (*a != '\0' && *a == *b) {
+    a++;
+    b++;
+  }
+  return (int)*a - (int)*b;
 }

 int strncmp(const char *s1, const char *s2, size_t n) {
-  panic("Not implemented");
+  // Compares at most n bytes as unsigned char. The previous loop never
+  // compared the characters it walked over (only the last index reached was
+  // inspected) and `n - 1` wrapped around for n == 0.
+  const unsigned char *a = (const unsigned char *)s1;
+  const unsigned char *b = (const unsigned char *)s2;
+  for (size_t i = 0; i < n; i++) {
+    if (a[i] != b[i]) {
+      return (int)a[i] - (int)b[i];
+    }
+    if (a[i] == '\0') {
+      break; // both strings ended together
+    }
+  }
+  return 0;
 }

 void *memset(void *s, int c, size_t n) {
-  panic("Not implemented");
+  // Fills the first n bytes of s with c converted to a byte; returns s.
+  // The index is a size_t so it is compared with n without sign conversion.
+  uint8_t *p = s;
+  for (size_t i = 0; i < n; i++) {
+    p[i] = (uint8_t)c;
+  }
+  return s;
 }

 void *memmove(void *dst, const void *src, size_t n) {
-  panic("Not implemented");
+  // Copies n bytes between regions that may overlap; returns dst.
+  // The chunked version copied whole (src - dst)-sized blocks in the dst < src
+  // case and ran past both buffers whenever n was not a multiple of that
+  // distance. A byte-wise copy in the safe direction has no such edge case.
+  uint8_t *d = dst;
+  const uint8_t *s = src;
+  if (d == s || n == 0) {
+    return dst;
+  }
+  if (d < s) {
+    // Destination starts first: copy front to back.
+    for (size_t i = 0; i < n; i++) {
+      d[i] = s[i];
+    }
+  } else {
+    // Destination starts later: copy back to front so that source bytes are
+    // read before they are overwritten.
+    for (size_t i = n; i > 0; i--) {
+      d[i - 1] = s[i - 1];
+    }
+  }
+  return dst;
 }

 void *memcpy(void *out, const void *in, size_t n) {
-  panic("Not implemented");
+  // Byte-wise forward copy of n bytes from in to out; returns out.
+  uint8_t *to = out;
+  const uint8_t *from = in;
+  size_t i = 0;
+  while (i < n) {
+    to[i] = from[i];
+    i++;
+  }
+  return out;
 }

 int memcmp(const void *s1, const void *s2, size_t n) {
-  panic("Not implemented");
+  // Compares the first n bytes as unsigned values. Returns the difference of
+  // the first mismatching pair, or 0 when the regions are equal. The previous
+  // version returned the difference of the two POINTERS, which says nothing
+  // about the contents.
+  const uint8_t *p1 = s1, *p2 = s2;
+  for (size_t i = 0; i < n; i++) {
+    if (p1[i] != p2[i]) {
+      return (int)p1[i] - (int)p2[i];
+    }
+  }
+  return 0;
 }

 #endif
--- a/abstract-machine/klib/tests/CMakeLists.txt
+++ b/abstract-machine/klib/tests/CMakeLists.txt
@ -0,0 +1,17 @@
+# One test executable per entry; <name>.c in this directory is its only source.
+set(TEST_SOURCES
+    stdio
+    string
+)
+
+foreach(TEST IN LISTS TEST_SOURCES)
+    add_executable(${TEST} ${TEST}.c)
+    # Explicit PRIVATE: the keyword-less signature has legacy semantics and
+    # cannot be combined with keyword calls on the same target later on.
+    target_link_libraries(${TEST} PRIVATE am-${ARCH} klib m)
+    target_include_directories(${TEST}
+        PRIVATE $<TARGET_PROPERTY:am_interface,INTERFACE_INCLUDE_DIRECTORIES>
+        PRIVATE $<TARGET_PROPERTY:klib_interface,INTERFACE_INCLUDE_DIRECTORIES>
+    )
+    # TODO: Run tests in other configurations
+    # Tests are only registered with CTest when running on the native platform.
+    if(__PLATFORM_NATIVE__)
+        add_test(NAME ${TEST} COMMAND ${TEST})
+    endif()
+endforeach()
--- a/abstract-machine/klib/tests/stdio.c
+++ b/abstract-machine/klib/tests/stdio.c
@ -0,0 +1,5 @@
+#include <klib.h>
+
+/* Placeholder: no stdio checks yet, the test passes unconditionally. */
+int main(void) {
+    return 0;
+}
--- a/abstract-machine/klib/tests/string.c
+++ b/abstract-machine/klib/tests/string.c
@ -0,0 +1,75 @@
+#include <klib.h>
+#include <klib-macros.h>
+#include <stdint.h>
+
+/* strcpy must return its destination and copy the string, terminator
+ * included, for sources at several offsets inside the same buffer. */
+void test_strcpy() {
+    char b[32];
+    char *s;
+
+    b[16] = 'a';
+    b[17] = 'b';
+    b[18] = 'c';
+    b[19] = 0;
+
+    s = strcpy(b, b + 16);
+    panic_on(s != b, "strcpy wrong return value");
+    panic_on(strcmp(s, "abc") != 0, "strcpy gave incorrect string");
+
+    s = strcpy(b + 1, b + 16);
+    panic_on(s != b + 1, "strcpy wrong return value");
+    panic_on(strcmp(s, "abc") != 0, "strcpy gave incorrect string");
+
+    s = strcpy(b + 1, b + 17);
+    panic_on(s != b + 1, "strcpy wrong return value");
+    panic_on(strcmp(s, "bc") != 0, "strcpy gave incorrect string");
+}
+
+/* strncpy with n equal to the source length must copy exactly n bytes and
+ * must not write a terminator past them. */
+void test_strncpy() {
+    char b[32];
+    char *s;
+    /* Sentinel right behind the copied range; it has to survive the call. */
+    b[3] = 'x';
+    b[4] = 0;
+    panic_on((s = strncpy(b, "abc", 3)) != b, "strncpy wrong return value");
+    panic_on(b[2] != 'c', "strncpy fails to copy last byte");
+    panic_on(b[3] != 'x', "strncpy overruns buffer to null-terminate");
+}
+
+/* strncmp must ignore bytes past n and still compare the n-th byte. */
+void test_strncmp() {
+    int beyond_n = strncmp("abcd", "abce", 3);
+    panic_on(beyond_n != 0, "strncmp compares past n");
+
+    int last_byte = strncmp("abc", "abd", 3);
+    panic_on(last_byte == 0, "strncmp fails to compare n-1st byte");
+}
+
+/* memset must return its argument, fill the requested range and leave the
+ * byte right behind it untouched. */
+void test_memset() {
+    uint8_t arr[128];
+    arr[120] = 0xd; /* sentinel directly after the filled range */
+
+    void *ret = memset(arr, 0xf, 120);
+    panic_on(ret != arr, "memset wrong return value");
+    panic_on(arr[7] != 0xf, "memset fails to set value in range");
+    panic_on(arr[120] != 0xd, "memset set value past n");
+}
+
+/* memcpy must reproduce the source bytes in the destination. */
+void test_memcpy() {
+    const uint8_t src[] = { 0x0, 0x0, 0x1, 0x2, 0x3, 0x4, 0x0, 0x0 };
+    uint8_t dst[8] = {0};
+
+    memcpy(dst, src, 8);
+
+    int diff = memcmp(dst, src, 8);
+    panic_on(diff != 0, "memcpy fails to copy memory");
+}
+
+/* memmove must handle disjoint regions as well as overlap in either
+ * direction. Each step operates on the buffer left behind by the previous
+ * one, so the order of the steps matters. */
+void test_memmove() {
+    const uint8_t initial[]        = { 0x0, 0x0, 0x1, 0x2, 0x3, 0x4, 0x0, 0x0 };
+    const uint8_t after_left[]     = { 0x1, 0x2, 0x3, 0x4, 0x3, 0x4, 0x0, 0x0 };
+    const uint8_t after_right_1[]  = { 0x1, 0x2, 0x2, 0x3, 0x4, 0x3, 0x0, 0x0 };
+    const uint8_t after_right_3[]  = { 0x1, 0x2, 0x2, 0x1, 0x2, 0x2, 0x3, 0x4 };
+    uint8_t buf[8] = {0};
+
+    /* Disjoint regions. */
+    memmove(buf, initial, 8);
+    panic_on(memcmp(buf, initial, 8) != 0, "memmove fails to copy non-overlapping memory");
+
+    /* Destination two bytes before the source. */
+    memmove(buf, buf + 2, 4);
+    panic_on(memcmp(buf, after_left, 8) != 0, "memmove fails to copy overlapping memory (dst < src)");
+
+    /* Destination one byte after the source. */
+    memmove(buf + 2, buf + 1, 4);
+    panic_on(memcmp(buf, after_right_1, 8) != 0, "memmove fails to copy overlapping memory (src < dst)");
+
+    /* Destination three bytes after the source. */
+    memmove(buf + 3, buf, 5);
+    panic_on(memcmp(buf, after_right_3, 8) != 0, "memmove fails to copy overlapping memory (src < dst)");
+}
+
+/* Runs every string/memory test in declaration order; a failing check stops
+ * the program through panic_on, so reaching the end means success. */
+int main(void) {
+    void (*const cases[])() = {
+        test_strcpy,
+        test_strncpy,
+        test_strncmp,
+        test_memset,
+        test_memcpy,
+        test_memmove,
+    };
+    for (unsigned i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
+        cases[i]();
+    }
+    return 0;
+}
--- a/abstract-machine/out/install/lib/libklib.a
+++ b/abstract-machine/out/install/lib/libklib.a
--- a/am-kernels/.direnv/flake-profile
+++ b/am-kernels/.direnv/flake-profile
@ -0,0 +1 @@
+flake-profile-79-link
--- a/am-kernels/.direnv/flake-profile-79-link
+++ b/am-kernels/.direnv/flake-profile-79-link
@ -0,0 +1 @@
+/nix/store/jn4rd289315ip9fx03z2dm980wzg4iaz-am-kernels-2024.02.18-env
--- a/am-kernels/.envrc
+++ b/am-kernels/.envrc
@ -0,0 +1 @@
+use flake ..#am-kernels
--- a/am-kernels/.git.bak/HEAD
+++ b/am-kernels/.git.bak/HEAD
@ -0,0 +1 @@
+ref: refs/heads/ics2021
--- a/am-kernels/.git.bak/config
+++ b/am-kernels/.git.bak/config
@ -0,0 +1,11 @@
+[core]
+	repositoryformatversion = 0
+	filemode = true
+	bare = false
+	logallrefupdates = true
+[remote "origin"]
+	url = git@github.com:NJU-ProjectN/am-kernels.git
+	fetch = +refs/heads/*:refs/remotes/origin/*
+[branch "ics2021"]
+	remote = origin
+	merge = refs/heads/ics2021
--- a/am-kernels/.git.bak/description
+++ b/am-kernels/.git.bak/description
@ -0,0 +1 @@
+Unnamed repository; edit this file 'description' to name the repository.
--- a/am-kernels/.git.bak/hooks/applypatch-msg.sample
+++ b/am-kernels/.git.bak/hooks/applypatch-msg.sample
@ -0,0 +1,15 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to check the commit log message taken by
+# applypatch from an e-mail message.
+#
+# The hook should exit with non-zero status after issuing an
+# appropriate message if it wants to stop the commit.  The hook is
+# allowed to edit the commit message file.
+#
+# To enable this hook, rename this file to "applypatch-msg".
+
+. git-sh-setup
+commitmsg="$(git rev-parse --git-path hooks/commit-msg)"
+test -x "$commitmsg" && exec "$commitmsg" ${1+"$@"}
+:
--- a/am-kernels/.git.bak/hooks/commit-msg.sample
+++ b/am-kernels/.git.bak/hooks/commit-msg.sample
@ -0,0 +1,24 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to check the commit log message.
+# Called by "git commit" with one argument, the name of the file
+# that has the commit message.  The hook should exit with non-zero
+# status after issuing an appropriate message if it wants to stop the
+# commit.  The hook is allowed to edit the commit message file.
+#
+# To enable this hook, rename this file to "commit-msg".
+
+# Uncomment the below to add a Signed-off-by line to the message.
+# Doing this in a hook is a bad idea in general, but the prepare-commit-msg
+# hook is more suited to it.
+#
+# SOB=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
+# grep -qs "^$SOB" "$1" || echo "$SOB" >> "$1"
+
+# This example catches duplicate Signed-off-by lines.
+
+test "" = "$(grep '^Signed-off-by: ' "$1" |
+	 sort | uniq -c | sed -e '/^[ 	]*1[ 	]/d')" || {
+	echo >&2 Duplicate Signed-off-by lines.
+	exit 1
+}
--- a/am-kernels/.git.bak/hooks/fsmonitor-watchman.sample
+++ b/am-kernels/.git.bak/hooks/fsmonitor-watchman.sample
@ -0,0 +1,174 @@
+#!/nix/store/jr2c1rk91nqlfz5a5lwfq2kyilxzj879-perl-5.38.2/bin/perl
+
+use strict;
+use warnings;
+use IPC::Open2;
+
+# An example hook script to integrate Watchman
+# (https://facebook.github.io/watchman/) with git to speed up detecting
+# new and modified files.
+#
+# The hook is passed a version (currently 2) and last update token
+# formatted as a string and outputs to stdout a new update token and
+# all files that have been modified since the update token. Paths must
+# be relative to the root of the working tree and separated by a single NUL.
+#
+# To enable this hook, rename this file to "query-watchman" and set
+# 'git config core.fsmonitor .git/hooks/query-watchman'
+#
+my ($version, $last_update_token) = @ARGV;
+
+# Uncomment for debugging
+# print STDERR "$0 $version $last_update_token\n";
+
+# Check the hook interface version
+if ($version ne 2) {
+	die "Unsupported query-fsmonitor hook version '$version'.\n" .
+	    "Falling back to scanning...\n";
+}
+
+my $git_work_tree = get_working_dir();
+
+my $retry = 1;
+
+my $json_pkg;
+eval {
+	require JSON::XS;
+	$json_pkg = "JSON::XS";
+	1;
+} or do {
+	require JSON::PP;
+	$json_pkg = "JSON::PP";
+};
+
+launch_watchman();
+
+sub launch_watchman {
+	my $o = watchman_query();
+	if (is_work_tree_watched($o)) {
+		output_result($o->{clock}, @{$o->{files}});
+	}
+}
+
+sub output_result {
+	my ($clockid, @files) = @_;
+
+	# Uncomment for debugging watchman output
+	# open (my $fh, ">", ".git/watchman-output.out");
+	# binmode $fh, ":utf8";
+	# print $fh "$clockid\n@files\n";
+	# close $fh;
+
+	binmode STDOUT, ":utf8";
+	print $clockid;
+	print "\0";
+	local $, = "\0";
+	print @files;
+}
+
+sub watchman_clock {
+	my $response = qx/watchman clock "$git_work_tree"/;
+	die "Failed to get clock id on '$git_work_tree'.\n" .
+		"Falling back to scanning...\n" if $? != 0;
+
+	return $json_pkg->new->utf8->decode($response);
+}
+
+sub watchman_query {
+	my $pid = open2(\*CHLD_OUT, \*CHLD_IN, 'watchman -j --no-pretty')
+	or die "open2() failed: $!\n" .
+	"Falling back to scanning...\n";
+
+	# In the query expression below we're asking for names of files that
+	# changed since $last_update_token but not from the .git folder.
+	#
+	# To accomplish this, we're using the "since" generator to use the
+	# recency index to select candidate nodes and "fields" to limit the
+	# output to file names only. Then we're using the "expression" term to
+	# further constrain the results.
+	my $last_update_line = "";
+	if (substr($last_update_token, 0, 1) eq "c") {
+		$last_update_token = "\"$last_update_token\"";
+		$last_update_line = qq[\n"since": $last_update_token,];
+	}
+	my $query = <<"	END";
+		["query", "$git_work_tree", {$last_update_line
+			"fields": ["name"],
+			"expression": ["not", ["dirname", ".git"]]
+		}]
+	END
+
+	# Uncomment for debugging the watchman query
+	# open (my $fh, ">", ".git/watchman-query.json");
+	# print $fh $query;
+	# close $fh;
+
+	print CHLD_IN $query;
+	close CHLD_IN;
+	my $response = do {local $/; <CHLD_OUT>};
+
+	# Uncomment for debugging the watch response
+	# open ($fh, ">", ".git/watchman-response.json");
+	# print $fh $response;
+	# close $fh;
+
+	die "Watchman: command returned no output.\n" .
+	"Falling back to scanning...\n" if $response eq "";
+	die "Watchman: command returned invalid output: $response\n" .
+	"Falling back to scanning...\n" unless $response =~ /^\{/;
+
+	return $json_pkg->new->utf8->decode($response);
+}
+
+sub is_work_tree_watched {
+	my ($output) = @_;
+	my $error = $output->{error};
+	if ($retry > 0 and $error and $error =~ m/unable to resolve root .* directory (.*) is not watched/) {
+		$retry--;
+		my $response = qx/watchman watch "$git_work_tree"/;
+		die "Failed to make watchman watch '$git_work_tree'.\n" .
+		    "Falling back to scanning...\n" if $? != 0;
+		$output = $json_pkg->new->utf8->decode($response);
+		$error = $output->{error};
+		die "Watchman: $error.\n" .
+		"Falling back to scanning...\n" if $error;
+
+		# Uncomment for debugging watchman output
+		# open (my $fh, ">", ".git/watchman-output.out");
+		# close $fh;
+
+		# Watchman will always return all files on the first query so
+		# return the fast "everything is dirty" flag to git and do the
+		# Watchman query just to get it over with now so we won't pay
+		# the cost in git to look up each individual file.
+		my $o = watchman_clock();
+		$error = $output->{error};
+
+		die "Watchman: $error.\n" .
+		"Falling back to scanning...\n" if $error;
+
+		output_result($o->{clock}, ("/"));
+		$last_update_token = $o->{clock};
+
+		eval { launch_watchman() };
+		return 0;
+	}
+
+	die "Watchman: $error.\n" .
+	"Falling back to scanning...\n" if $error;
+
+	return 1;
+}
+
+sub get_working_dir {
+	my $working_dir;
+	if ($^O =~ 'msys' || $^O =~ 'cygwin') {
+		$working_dir = Win32::GetCwd();
+		$working_dir =~ tr/\\/\//;
+	} else {
+		require Cwd;
+		$working_dir = Cwd::cwd();
+	}
+
+	return $working_dir;
+}
--- a/am-kernels/.git.bak/hooks/post-update.sample
+++ b/am-kernels/.git.bak/hooks/post-update.sample
@ -0,0 +1,8 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to prepare a packed repository for use over
+# dumb transports.
+#
+# To enable this hook, rename this file to "post-update".
+
+exec git update-server-info
--- a/am-kernels/.git.bak/hooks/pre-applypatch.sample
+++ b/am-kernels/.git.bak/hooks/pre-applypatch.sample
@ -0,0 +1,14 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to verify what is about to be committed
+# by applypatch from an e-mail message.
+#
+# The hook should exit with non-zero status after issuing an
+# appropriate message if it wants to stop the commit.
+#
+# To enable this hook, rename this file to "pre-applypatch".
+
+. git-sh-setup
+precommit="$(git rev-parse --git-path hooks/pre-commit)"
+test -x "$precommit" && exec "$precommit" ${1+"$@"}
+:
--- a/am-kernels/.git.bak/hooks/pre-commit.sample
+++ b/am-kernels/.git.bak/hooks/pre-commit.sample
@ -0,0 +1,49 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to verify what is about to be committed.
+# Called by "git commit" with no arguments.  The hook should
+# exit with non-zero status after issuing an appropriate message if
+# it wants to stop the commit.
+#
+# To enable this hook, rename this file to "pre-commit".
+
+if git rev-parse --verify HEAD >/dev/null 2>&1
+then
+	against=HEAD
+else
+	# Initial commit: diff against an empty tree object
+	against=$(git hash-object -t tree /dev/null)
+fi
+
+# If you want to allow non-ASCII filenames set this variable to true.
+allownonascii=$(git config --type=bool hooks.allownonascii)
+
+# Redirect output to stderr.
+exec 1>&2
+
+# Cross platform projects tend to avoid non-ASCII filenames; prevent
+# them from being added to the repository. We exploit the fact that the
+# printable range starts at the space character and ends with tilde.
+if [ "$allownonascii" != "true" ] &&
+	# Note that the use of brackets around a tr range is ok here, (it's
+	# even required, for portability to Solaris 10's /usr/bin/tr), since
+	# the square bracket bytes happen to fall in the designated range.
+	test $(git diff-index --cached --name-only --diff-filter=A -z $against |
+	  LC_ALL=C tr -d '[ -~]\0' | wc -c) != 0
+then
+	cat <<\EOF
+Error: Attempt to add a non-ASCII file name.
+
+This can cause problems if you want to work with people on other platforms.
+
+To be portable it is advisable to rename the file.
+
+If you know what you are doing you can disable this check using:
+
+  git config hooks.allownonascii true
+EOF
+	exit 1
+fi
+
+# If there are whitespace errors, print the offending file names and fail.
+exec git diff-index --check --cached $against --
--- a/am-kernels/.git.bak/hooks/pre-merge-commit.sample
+++ b/am-kernels/.git.bak/hooks/pre-merge-commit.sample
@ -0,0 +1,13 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to verify what is about to be committed.
+# Called by "git merge" with no arguments.  The hook should
+# exit with non-zero status after issuing an appropriate message to
+# stderr if it wants to stop the merge commit.
+#
+# To enable this hook, rename this file to "pre-merge-commit".
+
+. git-sh-setup
+test -x "$GIT_DIR/hooks/pre-commit" &&
+        exec "$GIT_DIR/hooks/pre-commit"
+:
--- a/am-kernels/.git.bak/hooks/pre-push.sample
+++ b/am-kernels/.git.bak/hooks/pre-push.sample
@ -0,0 +1,53 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+
+# An example hook script to verify what is about to be pushed.  Called by "git
+# push" after it has checked the remote status, but before anything has been
+# pushed.  If this script exits with a non-zero status nothing will be pushed.
+#
+# This hook is called with the following parameters:
+#
+# $1 -- Name of the remote to which the push is being done
+# $2 -- URL to which the push is being done
+#
+# If pushing without using a named remote those arguments will be equal.
+#
+# Information about the commits which are being pushed is supplied as lines to
+# the standard input in the form:
+#
+#   <local ref> <local oid> <remote ref> <remote oid>
+#
+# This sample shows how to prevent push of commits where the log message starts
+# with "WIP" (work in progress).
+
+remote="$1"
+url="$2"
+
+# All-zero object id: git hash-object output with every hex digit mapped to 0.
+null_oid=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
+
+while read local_ref local_oid remote_ref remote_oid
+do
+	# A deleted ref has no new commits to inspect.
+	if test "$local_oid" = "$null_oid"
+	then
+		continue
+	fi
+
+	# A new branch is examined in full; an existing one only for the
+	# commits between the remote tip and the local tip.
+	range="$remote_oid..$local_oid"
+	if test "$remote_oid" = "$null_oid"
+	then
+		range="$local_oid"
+	fi
+
+	# Refuse the push as soon as rev-list finds a commit matching '^WIP'.
+	wip=$(git rev-list -n 1 --grep '^WIP' "$range")
+	if test -n "$wip"
+	then
+		echo >&2 "Found WIP commit in $local_ref, not pushing"
+		exit 1
+	fi
+done
+
+exit 0
--- a/am-kernels/.git.bak/hooks/pre-rebase.sample
+++ b/am-kernels/.git.bak/hooks/pre-rebase.sample
@ -0,0 +1,169 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# Copyright (c) 2006, 2008 Junio C Hamano
+#
+# The "pre-rebase" hook is run just before "git rebase" starts doing
+# its job, and can prevent the command from running by exiting with
+# non-zero status.
+#
+# The hook is called with the following parameters:
+#
+# $1 -- the upstream the series was forked from.
+# $2 -- the branch being rebased (or empty when rebasing the current branch).
+#
+# This sample shows how to prevent topic branches that are already
+# merged to 'next' branch from getting rebased, because allowing it
+# would result in rebasing already published history.
+
+publish=next	# name of the branch that holds already-published history
+basebranch="$1"	# upstream the series was forked from; not referenced again below
+if test "$#" = 2
+then
+	topic="refs/heads/$2"
+else
+	topic=`git symbolic-ref HEAD` ||
+	exit 0 ;# we do not interrupt rebasing detached HEAD
+fi
+
+case "$topic" in
+refs/heads/??/*)	# only refs of the form refs/heads/<two characters>/<rest> are examined
+	;;
+*)
+	exit 0 ;# we do not interrupt others.
+	;;
+esac
+
+# From here on $topic is a topic branch that is about to be rebased
+# on top of master.  Decide whether that rebase may go ahead.
+
+# Does the topic really exist?
+git show-ref -q "$topic" || {
+	echo >&2 "No such branch $topic"
+	exit 1
+}
+
+# Is topic fully merged to master?  (No commits outside master means yes.)
+not_in_master=`git rev-list --pretty=oneline ^master "$topic"`
+if test -z "$not_in_master"
+then
+	echo >&2 "$topic is fully merged to master; better remove it."
+	exit 1 ;# we could allow it, but there is no point.
+fi
+
+# Is topic ever merged to next?  If so you should not be rebasing it.
+only_next_1=`git rev-list ^master "^$topic" ${publish} | sort`
+only_next_2=`git rev-list ^master           ${publish} | sort`
+if test "$only_next_1" = "$only_next_2"
+then	# excluding topic changes nothing: topic was never merged into $publish
+	not_in_topic=`git rev-list "^$topic" master`
+	if test -z "$not_in_topic"
+	then
+		echo >&2 "$topic is already up to date with master"
+		exit 1 ;# we could allow it, but there is no point.
+	else
+		exit 0
+	fi
+else	# topic has commits in $publish: list them on stderr and refuse the rebase
+	not_in_next=`git rev-list --pretty=oneline ^${publish} "$topic"`
+	/nix/store/jr2c1rk91nqlfz5a5lwfq2kyilxzj879-perl-5.38.2/bin/perl -e '
+		my $topic = $ARGV[0];
+		my $msg = "* $topic has commits already merged to public branch:\n";
+		my (%not_in_next) = map {
+			/^([0-9a-f]+) /;
+			($1 => 1);
+		} split(/\n/, $ARGV[1]);
+		for my $elem (map {
+				/^([0-9a-f]+) (.*)$/;
+				[$1 => $2];
+			} split(/\n/, $ARGV[2])) {
+			if (!exists $not_in_next{$elem->[0]}) {
+				if ($msg) {
+					print STDERR $msg;
+					undef $msg;
+				}
+				print STDERR " $elem->[1]\n";
+			}
+		}
+	' "$topic" "$not_in_next" "$not_in_master"
+	exit 1
+fi
+
+<<\DOC_END
+
+This sample hook safeguards topic branches that have been
+published from being rewound.
+
+The workflow assumed here is:
+
+ * Once a topic branch forks from "master", "master" is never
+   merged into it again (either directly or indirectly).
+
+ * Once a topic branch is fully cooked and merged into "master",
+   it is deleted.  If you need to build on top of it to correct
+   earlier mistakes, a new topic branch is created by forking at
+   the tip of the "master".  This is not strictly necessary, but
+   it makes it easier to keep your history simple.
+
+ * Whenever you need to test or publish your changes to topic
+   branches, merge them into "next" branch.
+
+The script, being an example, hardcodes the publish branch name
+to be "next", but it is trivial to make it configurable via
+$GIT_DIR/config mechanism.
+
+With this workflow, you would want to know:
+
+(1) ... if a topic branch has ever been merged to "next".  Young
+    topic branches can have stupid mistakes you would rather
+    clean up before publishing, and things that have not been
+    merged into other branches can be easily rebased without
+    affecting other people.  But once it is published, you would
+    not want to rewind it.
+
+(2) ... if a topic branch has been fully merged to "master".
+    Then you can delete it.  More importantly, you should not
+    build on top of it -- other people may already want to
+    change things related to the topic as patches against your
+    "master", so if you need further changes, it is better to
+    fork the topic (perhaps with the same name) afresh from the
+    tip of "master".
+
+Let's look at this example:
+
+		   o---o---o---o---o---o---o---o---o---o "next"
+		  /       /           /           /
+		 /   a---a---b A     /           /
+		/   /               /           /
+	       /   /   c---c---c---c B         /
+	      /   /   /             \         /
+	     /   /   /   b---b C     \       /
+	    /   /   /   /             \     /
+    ---o---o---o---o---o---o---o---o---o---o---o "master"
+
+
+A, B and C are topic branches.
+
+ * A has one fix since it was merged up to "next".
+
+ * B has finished.  It has been fully merged up to "master" and "next",
+   and is ready to be deleted.
+
+ * C has not merged to "next" at all.
+
+We would want to allow C to be rebased, refuse A, and encourage
+B to be deleted.
+
+To compute (1):
+
+	git rev-list ^master ^topic next
+	git rev-list ^master        next
+
+	if these match, topic has not merged in next at all.
+
+To compute (2):
+
+	git rev-list master..topic
+
+	if this is empty, it is fully merged to "master".
+
+DOC_END
--- a/am-kernels/.git.bak/hooks/pre-receive.sample
+++ b/am-kernels/.git.bak/hooks/pre-receive.sample
@ -0,0 +1,24 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to make use of push options.
+# The example simply echoes all push options that start with 'echoback='
+# and rejects all pushes when the "reject" push option is used.
+#
+# To enable this hook, rename this file to "pre-receive".
+
+# Without any push options there is nothing to echo or reject.
+test -n "$GIT_PUSH_OPTION_COUNT" || exit 0
+# Options are read from GIT_PUSH_OPTION_0 up to GIT_PUSH_OPTION_<count - 1>.
+idx=0
+while test "$idx" -lt "$GIT_PUSH_OPTION_COUNT"
+do
+	eval "opt=\$GIT_PUSH_OPTION_$idx"
+	case "$opt" in
+	echoback=*)
+		# Echo back everything after the first '='.
+		echo "echo from the pre-receive-hook: ${opt#*=}" >&2 ;;
+	reject)
+		exit 1 ;;
+	esac
+	idx=$((idx + 1))
+done
--- a/am-kernels/.git.bak/hooks/prepare-commit-msg.sample
+++ b/am-kernels/.git.bak/hooks/prepare-commit-msg.sample
@ -0,0 +1,42 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to prepare the commit log message.
+# Called by "git commit" with the name of the file that has the
+# commit message, followed by the description of the commit
+# message's source.  The hook's purpose is to edit the commit
+# message file.  If the hook fails with a non-zero status,
+# the commit is aborted.
+#
+# To enable this hook, rename this file to "prepare-commit-msg".
+
+# This hook includes three examples. The first one removes the
+# "# Please enter the commit message..." help message.
+#
+# The second includes the output of "git diff --name-status -r"
+# into the message, just before the "git status" output.  It is
+# commented because it doesn't cope with --amend or with squashed
+# commits.
+#
+# The third example adds a Signed-off-by line to the message, that can
+# still be edited.  This is rarely a good idea.
+
+COMMIT_MSG_FILE=$1	# name of the file that has the commit message
+COMMIT_SOURCE=$2	# description of the message's source; only the commented-out examples read it
+SHA1=$3	# third hook argument; only the commented-out second example reads it
+# Example 1: delete from the "Please enter the commit message" line through the next bare "#" line; -i.bak keeps a backup.
+/nix/store/jr2c1rk91nqlfz5a5lwfq2kyilxzj879-perl-5.38.2/bin/perl -i.bak -ne 'print unless(m/^. Please enter the commit message/..m/^#$/)' "$COMMIT_MSG_FILE"
+
+# case "$COMMIT_SOURCE,$SHA1" in
+#  ,|template,)
+#    /nix/store/jr2c1rk91nqlfz5a5lwfq2kyilxzj879-perl-5.38.2/bin/perl -i.bak -pe '
+#       print "\n" . `git diff --cached --name-status -r`
+# 	 if /^#/ && $first++ == 0' "$COMMIT_MSG_FILE" ;;
+#  *) ;;
+# esac
+
+# SOB=$(git var GIT_COMMITTER_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p')
+# git interpret-trailers --in-place --trailer "$SOB" "$COMMIT_MSG_FILE"
+# if test -z "$COMMIT_SOURCE"
+# then
+#   /nix/store/jr2c1rk91nqlfz5a5lwfq2kyilxzj879-perl-5.38.2/bin/perl -i.bak -pe 'print "\n" if !$first_line++' "$COMMIT_MSG_FILE"
+# fi
--- a/am-kernels/.git.bak/hooks/push-to-checkout.sample
+++ b/am-kernels/.git.bak/hooks/push-to-checkout.sample
@ -0,0 +1,78 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+
+# An example hook script to update a checked-out tree on a git push.
+#
+# This hook is invoked by git-receive-pack(1) when it reacts to git
+# push and updates reference(s) in its repository, and when the push
+# tries to update the branch that is currently checked out and the
+# receive.denyCurrentBranch configuration variable is set to
+# updateInstead.
+#
+# By default, such a push is refused if the working tree and the index
+# of the remote repository has any difference from the currently
+# checked out commit; when both the working tree and the index match
+# the current commit, they are updated to match the newly pushed tip
+# of the branch. This hook is to be used to override the default
+# behaviour; however the code below reimplements the default behaviour
+# as a starting point for convenient modification.
+#
+# The hook receives the commit with which the tip of the current
+# branch is going to be updated:
+commit=$1
+
+# It can exit with a non-zero status to refuse the push (when it does
+# so, it must not modify the index or the working tree).
+die () {
+	echo >&2 "$*"
+	exit 1
+}
+
+# Or it can make any necessary changes to the working tree and to the
+# index to bring them to the desired state when the tip of the current
+# branch is updated to the new commit, and exit with a zero status.
+#
+# For example, the hook can simply run git read-tree -u -m HEAD "$1"
+# in order to emulate git fetch that is run in the reverse direction
+# with git push, as the two-tree form of git read-tree -u -m is
+# essentially the same as git switch or git checkout that switches
+# branches while keeping the local changes in the working tree that do
+# not interfere with the difference between the branches.
+
+# The below is a more-or-less exact translation to shell of the C code
+# for the default behaviour for git's push-to-checkout hook defined in
+# the push_to_deploy() function in builtin/receive-pack.c.
+#
+# Note that the hook will be executed from the repository directory,
+# not from the working tree, so if you want to perform operations on
+# the working tree, you will have to adapt your code accordingly, e.g.
+# by adding "cd .." or using relative paths.
+
+# Step 1: refresh the index; if even that fails the state cannot be
+# trusted, so give up.
+git update-index -q --ignore-submodules --refresh ||
+	die "Up-to-date check failed"
+
+# Step 2: the working tree must not carry unstaged changes.
+git diff-files --quiet --ignore-submodules -- ||
+	die "Working directory has unstaged changes"
+
+# Step 3: pick the base for the staged-changes check.  Shell rendering of
+#
+#   head_has_history() ? "HEAD" : EMPTY_TREE_SHA1_HEX
+#
+# i.e. HEAD when it exists, otherwise the empty tree object.
+head=HEAD
+git cat-file -e HEAD 2>/dev/null ||
+	head=$(git hash-object -t tree --stdin </dev/null)
+
+# Step 4: the index must not carry staged changes relative to that base.
+git diff-index --quiet --cached --ignore-submodules $head -- ||
+	die "Working directory has staged changes"
+
+# Step 5: everything is clean, so bring the index and the working tree to
+# the pushed commit.
+#
+# A failure here is reported like the others: die prints the message on
+# stderr and exits with status 1, which refuses the push.
+git read-tree -u -m "$commit" ||
+	die "Could not update working tree to new HEAD"
--- a/am-kernels/.git.bak/hooks/sendemail-validate.sample
+++ b/am-kernels/.git.bak/hooks/sendemail-validate.sample
@ -0,0 +1,77 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+
+# An example hook script to validate a patch (and/or patch series) before
+# sending it via email.
+#
+# The hook should exit with non-zero status after issuing an appropriate
+# message if it wants to prevent the email(s) from being sent.
+#
+# To enable this hook, rename this file to "sendemail-validate".
+#
+# By default, it will only check that the patch(es) can be applied on top of
+# the default upstream branch without conflicts in a secondary worktree. After
+# validation (successful or not) of the last patch of a series, the worktree
+# will be deleted.
+#
+# The following config variables can be set to change the default remote and
+# remote ref that are used to apply the patches against:
+#
+#   sendemail.validateRemote (default: origin)
+#   sendemail.validateRemoteRef (default: HEAD)
+#
+# Replace the TODO placeholders with appropriate checks according to your
+# needs.
+
+validate_cover_letter () {
+	file="$1"	# path of the cover letter; plain assignment, so not function-local
+	# TODO: Replace with appropriate checks (e.g. spell checking).
+	true	# placeholder: every cover letter is accepted
+}
+
+validate_patch () {
+	file="$1"	# path of the patch file; plain assignment, so not function-local
+	# Ensure that the patch applies without conflicts.
+	git am -3 "$file" || return	# bare return hands git am's failure status to the caller
+	# TODO: Replace with appropriate checks for this patch
+	# (e.g. checkpatch.pl).
+	true	# placeholder: a patch that applied cleanly is accepted
+}
+
+validate_series () {
+	# TODO: Replace with appropriate checks for the whole series
+	# (e.g. quick build, coding style checks, etc.).
+	true	# placeholder: every series is accepted
+}
+
+# main -------------------------------------------------------------------------
+
+if test "$GIT_SENDEMAIL_FILE_COUNTER" = 1
+then	# first file of the series: create a scratch worktree and record its path
+	remote=$(git config --default origin --get sendemail.validateRemote) &&
+	ref=$(git config --default HEAD --get sendemail.validateRemoteRef) &&
+	worktree=$(mktemp --tmpdir -d sendemail-validate.XXXXXXX) &&
+	git worktree add -fd --checkout "$worktree" "refs/remotes/$remote/$ref" &&
+	git config --replace-all sendemail.validateWorktree "$worktree"
+else	# later files: reuse the worktree path recorded for the first file
+	worktree=$(git config --get sendemail.validateWorktree)
+fi || {	# the last command of whichever branch ran failed
+	echo "sendemail-validate: error: failed to prepare worktree" >&2
+	exit 1
+}
+
+unset GIT_DIR GIT_WORK_TREE	# NOTE(review): presumably so git resolves the repository from the worktree entered next; confirm
+cd "$worktree" &&
+
+if grep -q "^diff --git " "$1"
+then	# the file contains a "diff --git " line, so it is validated as a patch
+	validate_patch "$1"
+else	# no such line: the file is validated as the cover letter
+	validate_cover_letter "$1"
+fi &&
+
+if test "$GIT_SENDEMAIL_FILE_COUNTER" = "$GIT_SENDEMAIL_FILE_TOTAL"
+then	# last file of the series: forget the worktree, remove it at exit, validate the series
+	git config --unset-all sendemail.validateWorktree &&
+	trap 'git worktree remove -ff "$worktree"' EXIT &&
+	validate_series
+fi
--- a/am-kernels/.git.bak/hooks/update.sample
+++ b/am-kernels/.git.bak/hooks/update.sample
@ -0,0 +1,128 @@
+#!/nix/store/087167dfxal194pm54cmcbbxsfy3cjgn-bash-5.2p26/bin/bash
+#
+# An example hook script to block unannotated tags from entering.
+# Called by "git receive-pack" with arguments: refname sha1-old sha1-new
+#
+# To enable this hook, rename this file to "update".
+#
+# Config
+# ------
+# hooks.allowunannotated
+#   This boolean sets whether unannotated tags will be allowed into the
+#   repository.  By default they won't be.
+# hooks.allowdeletetag
+#   This boolean sets whether deleting tags will be allowed in the
+#   repository.  By default they won't be.
+# hooks.allowmodifytag
+#   This boolean sets whether a tag may be modified after creation. By default
+#   it won't be.
+# hooks.allowdeletebranch
+#   This boolean sets whether deleting branches will be allowed in the
+#   repository.  By default they won't be.
+# hooks.denycreatebranch
+#   This boolean sets whether remotely creating branches will be denied
+#   in the repository.  By default this is allowed.
+#
+
+# --- Command line
+refname="$1"
+oldrev="$2"
+newrev="$3"
+
+# --- Safety check
+if [ -z "$GIT_DIR" ]; then
+	echo "Don't run this script from the command line." >&2
+	echo " (if you want, you could supply GIT_DIR then run" >&2
+	echo "  $0 <ref> <oldrev> <newrev>)" >&2
+	exit 1
+fi
+
+if [ -z "$refname" -o -z "$oldrev" -o -z "$newrev" ]; then
+	echo "usage: $0 <ref> <oldrev> <newrev>" >&2
+	exit 1
+fi
+
+# --- Config (each value is later compared against the literal string "true")
+allowunannotated=$(git config --type=bool hooks.allowunannotated)
+allowdeletebranch=$(git config --type=bool hooks.allowdeletebranch)
+denycreatebranch=$(git config --type=bool hooks.denycreatebranch)
+allowdeletetag=$(git config --type=bool hooks.allowdeletetag)
+allowmodifytag=$(git config --type=bool hooks.allowmodifytag)
+
+# Refuse the update while the first line of $GIT_DIR/description is empty or still starts with "Unnamed repository".
+projectdesc=$(sed -e '1q' "$GIT_DIR/description")
+case "$projectdesc" in
+"Unnamed repository"* | "")
+	echo "*** Project description file hasn't been set" >&2
+	exit 1
+	;;
+esac
+
+# --- Check types
+# if $newrev is 0000...0000, it's a commit to delete a ref.
+zero=$(git hash-object --stdin </dev/null | tr '[0-9a-f]' '0')
+if [ "$newrev" = "$zero" ]; then
+	newrev_type=delete
+else
+	newrev_type=$(git cat-file -t "$newrev")
+fi
+
+case "$refname","$newrev_type" in
+	refs/tags/*,commit)
+		# un-annotated tag
+		short_refname=${refname##refs/tags/}
+		if [ "$allowunannotated" != "true" ]; then
+			echo "*** The un-annotated tag, $short_refname, is not allowed in this repository" >&2
+			echo "*** Use 'git tag [ -a | -s ]' for tags you want to propagate." >&2
+			exit 1
+		fi
+		;;
+	refs/tags/*,delete)
+		# delete tag
+		if [ "$allowdeletetag" != "true" ]; then
+			echo "*** Deleting a tag is not allowed in this repository" >&2
+			exit 1
+		fi
+		;;
+	refs/tags/*,tag)
+		# annotated tag: refused only when the ref already resolves
+		if [ "$allowmodifytag" != "true" ] && git rev-parse "$refname" > /dev/null 2>&1
+		then
+			echo "*** Tag '$refname' already exists." >&2
+			echo "*** Modifying a tag is not allowed in this repository." >&2
+			exit 1
+		fi
+		;;
+	refs/heads/*,commit)
+		# branch: an all-zero old revision means the branch is being created
+		if [ "$oldrev" = "$zero" ] && [ "$denycreatebranch" = "true" ]; then
+			echo "*** Creating a branch is not allowed in this repository" >&2
+			exit 1
+		fi
+		;;
+	refs/heads/*,delete)
+		# delete branch
+		if [ "$allowdeletebranch" != "true" ]; then
+			echo "*** Deleting a branch is not allowed in this repository" >&2
+			exit 1
+		fi
+		;;
+	refs/remotes/*,commit)
+		# tracking branch
+		;;
+	refs/remotes/*,delete)
+		# delete tracking branch
+		if [ "$allowdeletebranch" != "true" ]; then
+			echo "*** Deleting a tracking branch is not allowed in this repository" >&2
+			exit 1
+		fi
+		;;
+	*)
+		# Anything else (is there anything else?)
+		echo "*** Update hook: unknown type of update to ref $refname of type $newrev_type" >&2
+		exit 1
+		;;
+esac
+
+# --- Finished
+exit 0
--- a/am-kernels/.git.bak/index
+++ b/am-kernels/.git.bak/index
--- a/am-kernels/.git.bak/info/exclude
+++ b/am-kernels/.git.bak/info/exclude
@ -0,0 +1,6 @@
+# git ls-files --others --exclude-from=.git/info/exclude
+# Lines that start with '#' are comments.
+# For a project mostly in C, the following would be a good set of
+# exclude patterns (uncomment them if you want to use them):
+# *.[oa]
+# *~
--- a/am-kernels/.git.bak/logs/HEAD
+++ b/am-kernels/.git.bak/logs/HEAD
@ -0,0 +1 @@
+0000000000000000000000000000000000000000 bb725d6f8223dd7de831c3b692e8c4531e9d01af xinyangli <lixinyang411@gmail.com> 1709436368 +0800	clone: from github.com:NJU-ProjectN/am-kernels.git
--- a/am-kernels/.git.bak/logs/refs/heads/ics2021
+++ b/am-kernels/.git.bak/logs/refs/heads/ics2021
@ -0,0 +1 @@
+0000000000000000000000000000000000000000 bb725d6f8223dd7de831c3b692e8c4531e9d01af xinyangli <lixinyang411@gmail.com> 1709436368 +0800	clone: from github.com:NJU-ProjectN/am-kernels.git
--- a/am-kernels/.git.bak/logs/refs/remotes/origin/HEAD
+++ b/am-kernels/.git.bak/logs/refs/remotes/origin/HEAD
@ -0,0 +1 @@
+0000000000000000000000000000000000000000 bb725d6f8223dd7de831c3b692e8c4531e9d01af xinyangli <lixinyang411@gmail.com> 1709436368 +0800	clone: from github.com:NJU-ProjectN/am-kernels.git
--- a/am-kernels/.git.bak/objects/0f/559e690606f3feab9e02813cf2085b5c471b56
+++ b/am-kernels/.git.bak/objects/0f/559e690606f3feab9e02813cf2085b5c471b56
--- a/am-kernels/.git.bak/objects/36/b9a39f647bf13895df413debc9061d055249b0
+++ b/am-kernels/.git.bak/objects/36/b9a39f647bf13895df413debc9061d055249b0
--- a/am-kernels/.git.bak/objects/45/04c420cc600ee278b41dc8911acbfb4fdfa7c0
+++ b/am-kernels/.git.bak/objects/45/04c420cc600ee278b41dc8911acbfb4fdfa7c0
--- a/am-kernels/.git.bak/objects/4d/5393c3c53c4153c5db5227da8d3928226d2a5d
+++ b/am-kernels/.git.bak/objects/4d/5393c3c53c4153c5db5227da8d3928226d2a5d
--- a/am-kernels/.git.bak/objects/61/f185ad27e7f199809de98378cec302553e436f
+++ b/am-kernels/.git.bak/objects/61/f185ad27e7f199809de98378cec302553e436f
@ -0,0 +1 @@
+xM<>ΑjΓ0D{ΦW<CEA6><57>ƒtpJάC{<15>KLλ$X¦W!KΫDµ-»²T(!<21>^9%<25>Ϋξ,3σ¶ι†§/Ο<0F>—-<2D>ήXΣ‡^8ψΖ<>&yΕ‹ύ?<>”"4Ία”'²OZpΊ‰Ά	<ΙJφ–‹Lπ<4C>ν6¬Ϊΰυ<CEB0>"°²ι@t<>ƒ<Ι0γe1VuAΗύ<04>=9γ88<0F>"5Kb4 ¦›HfΗ'Y<>ο¬~έWε—¬Ξ¶9Η+½ω<C2BD>Ε?«²ν<C2B2>Ω zgΊλey.8»$w93΅<33>Ρ±@jV~pΏΔΓδ§G5†δ:Ρ?¬`+
--- a/am-kernels/.git.bak/objects/64/8f3ff6fbb33bf19992ed5cb7a0a2ec36039c69
+++ b/am-kernels/.git.bak/objects/64/8f3ff6fbb33bf19992ed5cb7a0a2ec36039c69
--- a/am-kernels/.git.bak/objects/75/c46af0731c3ce478cdb9b354ea50a72fa7f1bc
+++ b/am-kernels/.git.bak/objects/75/c46af0731c3ce478cdb9b354ea50a72fa7f1bc
--- a/am-kernels/.git.bak/objects/7a/bb6b5fc51cdaa1d5c45fb5bfca000843ca9b77
+++ b/am-kernels/.git.bak/objects/7a/bb6b5fc51cdaa1d5c45fb5bfca000843ca9b77
@ -0,0 +1,2 @@
+x<01>RÛn›@í+û#„%@Çq£J¸‰ŒGåAûP™(â²Ô«.ëˆÅj¥ªÿÞY0‰Ÿª>ÍîÌ™³svNÉ%|¼ºzç>n“ø›çÐ|¯K¨8-ð¢§²ÃôÃÐ"$
²¯á¸·£òÈ{b˜rO±ë!cÑBØ&	“ô9
n›_,—»Åj¹hO…û4â³ÂåTˆ“8xí¸h	ÁWg˜e!©(ZŠGqèkÖáa|¸a¢5£œ»àÝv+ÝRs ‘E]¿t´a¿ *~`äÔ}<7D>i%É"ÚšVûèƒNÎPìÎ0ºZN`=mMi=yú)EP·êö`æ½qÎyåÇü8ÔÌNŸ8½äPÐc?
+”0;YºÉÔñS.˜¨ø±F‘Æo?zÞ&Qðg>1êê‹×ÈÁhQ8œSà§›í5ÎƒÁÂù"ÿs°‰nï?Ì¬ô{* 'šöÒ1Ñ7 ïf‹ò	<09>ã’†]X<>~–Mµ+¨Õ†›³½®
å’þ‹wnÙ¶}ç?„þ‹²aDsÖ]«äLj]Ã&<04>8XEW…òàÉ`xGôÛ• ]G LëM±ugt6”GÆë9!£¡=òØøÑX
--- a/am-kernels/.git.bak/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391
+++ b/am-kernels/.git.bak/objects/e6/9de29bb2d1d6434b8b29ae775ad8c2e48c5391
--- a/am-kernels/.git.bak/objects/fe/95b9b6930339a0f004b6eb08a9ad9f5642e9f8
+++ b/am-kernels/.git.bak/objects/fe/95b9b6930339a0f004b6eb08a9ad9f5642e9f8
--- a/am-kernels/.git.bak/objects/pack/pack-fd014db4984fb9bfc8a49de0e4f554d0018fe65a.idx
+++ b/am-kernels/.git.bak/objects/pack/pack-fd014db4984fb9bfc8a49de0e4f554d0018fe65a.idx
--- a/am-kernels/.git.bak/objects/pack/pack-fd014db4984fb9bfc8a49de0e4f554d0018fe65a.pack
+++ b/am-kernels/.git.bak/objects/pack/pack-fd014db4984fb9bfc8a49de0e4f554d0018fe65a.pack
--- a/am-kernels/.git.bak/objects/pack/pack-fd014db4984fb9bfc8a49de0e4f554d0018fe65a.rev
+++ b/am-kernels/.git.bak/objects/pack/pack-fd014db4984fb9bfc8a49de0e4f554d0018fe65a.rev
--- a/am-kernels/.git.bak/packed-refs
+++ b/am-kernels/.git.bak/packed-refs
@ -0,0 +1,4 @@
+# pack-refs with: peeled fully-peeled sorted 
+9ab41b3e051a49789b458deb0153c9dfe8e93d00 refs/remotes/origin/ics2020
+bb725d6f8223dd7de831c3b692e8c4531e9d01af refs/remotes/origin/ics2021
+bb725d6f8223dd7de831c3b692e8c4531e9d01af refs/remotes/origin/master
--- a/am-kernels/.git.bak/refs/heads/ics2021
+++ b/am-kernels/.git.bak/refs/heads/ics2021
@ -0,0 +1 @@
+bb725d6f8223dd7de831c3b692e8c4531e9d01af
--- a/am-kernels/.git.bak/refs/remotes/origin/HEAD
+++ b/am-kernels/.git.bak/refs/remotes/origin/HEAD
@ -0,0 +1 @@
+ref: refs/remotes/origin/master
--- a/am-kernels/.gitignore
+++ b/am-kernels/.gitignore
@ -0,0 +1,15 @@
+*
+!*/
+!*.h
+!*.c
+!*.cc
+!*.S
+!Makefile
+!README
+!README.md
+!LICENSE
+.*
+_*
+*~
+build/
+!.gitignore
--- a/am-kernels/CMakeLists.txt
+++ b/am-kernels/CMakeLists.txt
@ -0,0 +1,16 @@
+cmake_minimum_required(VERSION 3.22)
+
+project(am-kernels)
+set(CMAKE_C_STANDARD 11)
+enable_language(C ASM)
+
+include(CheckPIESupported)
+check_pie_supported()
+# ARCH is "native" when PLATFORM matches "native", otherwise "<ISA>-<PLATFORM>".
+if("${PLATFORM}" MATCHES "native")  # quoted so an empty or unset PLATFORM cannot break if()
+  set(ARCH "native")
+else()
+  set(ARCH "${ISA}-${PLATFORM}")
+endif()
+
+add_subdirectory(tests/cpu-tests)
--- a/am-kernels/LICENSE
+++ b/am-kernels/LICENSE
@ -0,0 +1,22 @@
+The AbstractMachine software is:
+
+Copyright (c) 2018-2020 Yanyan Jiang and Zihao Yu
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/am-kernels/README
+++ b/am-kernels/README
@ -0,0 +1,6 @@
+AbstractMachine kernels
+
+CONTACTS
+
+Bug reports and suggestions go to Yanyan Jiang (jyy@nju.edu.cn) and Zihao 
+Yu (yuzihao@ict.ac.cn).
--- a/am-kernels/benchmarks/coremark/Makefile
+++ b/am-kernels/benchmarks/coremark/Makefile
@ -0,0 +1,3 @@
+NAME = coremark
+SRCS = $(shell find src/ -name "*.c")
+include $(AM_HOME)/Makefile
--- a/am-kernels/benchmarks/coremark/README.md
+++ b/am-kernels/benchmarks/coremark/README.md
@ -0,0 +1,231 @@
+# Coremark
+
+'''
+File: CoreMark
+
+Topic: Welcome
+Copyright © 2009 EEMBC All rights reserved. 
+CoreMark is a trademark of EEMBC and EEMBC is a registered trademark of the Embedded Microprocessor Benchmark Consortium.
+
+CoreMark's primary goals are simplicity and providing a method for testing only a processor's core features. 
+
+For more information about EEMBC's comprehensive embedded benchmark suites, please see www.eembc.org.
+
+Topic: Building and running
+	Download the release files from the www.coremark.org.
+	You can verify the download using the coremark_<version>.md5 file
+	> md5sum -c coremark_<version>.md5
+	
+	Unpack the distribution (tar -vzxf coremark_<version>.tgz && tar -vzxf coremark_<version>_docs.tgz) 
+	then change to the coremark_<version> folder.
+	
+	To build and run the benchmark, type 
+	> make
+	Full results are available in the files run1.log and run2.log.
+	CoreMark result can be found in run1.log.
+	
+	For self hosted Linux or Cygwin platforms, a simple make should work.
+	
+	Cross Compile:
+	For cross compile platforms please adjust <core_portme.mak>, <core_portme.h> (and possibly <core_portme.c>) 
+	according to the specific platform used.
+	When porting to a new platform, it is recommended to copy one of the default port folders 
+	(e.g. mkdir <platform> && cp linux/* <platform>), adjust the porting files, and run 
+	> make PORT_DIR=<platform>
+	
+	Systems without make:
+	The following files need to be compiled:
+	- <core_list_join.c> 
+	- <core_main.c> 
+	- <core_matrix.c> 
+	- <core_state.c>	
+	- <core_util.c>	
+	- <PORT_DIR>/<core_portme.c>
+	
+	For example
+	> gcc -O2 -o coremark.exe core_list_join.c core_main.c core_matrix.c core_state.c core_util.c simple/core_portme.c -DPERFORMANCE_RUN=1 -DITERATIONS=1000 
+	> ./coremark.exe > run1.log
+	The above will compile the benchmark for a performance run and 1000 iterations. Output is redirected to run1.log.
+	
+	Make targets:
+	run - Default target, creates run1.log and run2.log.
+	run1.log - Run the benchmark with performance parameters, and output to run1.log
+	run2.log - Run the benchmark with validation parameters, and output to run2.log
+	run3.log - Run the benchmark with profile generation parameters, and output to run3.log
+	compile - compile the benchmark executable 
+	link - link the benchmark executable
+	check - test MD5 of sources that may not be modified
+	clean - clean temporary files
+	
+	ITERATIONS: 
+	By default, the benchmark will run between 10-100 seconds.
+	To override, use ITERATIONS=N
+	> make ITERATIONS=10 
+	Will run the benchmark for 10 iterations. 
+	It is recommended to set a specific number of iterations in certain situations e.g.:
+	- Running with a simulator
+	- Measuring power/energy
+	- Timing cannot be restarted
+	
+	Minimum required run time: 
+	Results are only valid for reporting if the benchmark ran for at least 10 secs!
+	
+	XCFLAGS:
+	To add compiler flags from the command line, use XCFLAGS e.g.
+	> make XCFLAGS="-g -DMULTITHREAD=4 -DUSE_FORK=1"
+	
+	o CORE_DEBUG
+	
+	Define to compile for a debug run if you get incorrect CRC.
+	> make XCFLAGS="-DCORE_DEBUG=1"
+	
+	o Parallel Execution
+	
+	Use XCFLAGS=-DMULTITHREAD=N where N is number of threads to run in parallel.
+	Several implementations are available to execute in multiple contexts,
+	or you can implement your own in <core_portme.c>.
+	> make XCFLAGS="-DMULTITHREAD=4 -DUSE_PTHREAD" 
+	Above will compile the benchmark for execution on 4 cores, using POSIX Threads API.
+	
+	REBUILD:
+	To force rebuild, add the flag REBUILD to the command line
+	> make REBUILD=1
+	
+	Check core_portme.mak for more important options.
+
+	Run parameters for the benchmark executable:
+	Coremark executable takes several parameters as follows (if main accepts arguments).
+	1st - A seed value used for initialization of data.
+	2nd - A seed value used for initialization of data.
+	3rd - A seed value used for initialization of data.
+	4th - Number of iterations (0 for auto : default value)
+	5th - Reserved for internal use. 
+	6th - Reserved for internal use. 
+	7th - For malloc users only, override the size of the input data buffer.
+	
+	The run target from make will run coremark with 2 different data initialization seeds.
+
+	Alternative parameters: 
+	If not using malloc or command line arguments are not supported, the buffer size
+	for the algorithms must be defined via the compiler define TOTAL_DATA_SIZE.
+	TOTAL_DATA_SIZE must be set to 2000 bytes (default) for standard runs.
+	The default for such a target when testing different configurations could be ...
+	> make XCFLAGS="-DTOTAL_DATA_SIZE=6000 -DMAIN_HAS_NOARGC=1"
+	
+Topic: Documentation
+	When you unpack the documentation (tar -vzxf coremark_<version>_docs.tgz) a docs folder will be created.
+	Check the file docs/html/index.html and the website http://www.coremark.org for more info.
+	
+Topic: Submitting results
+	CoreMark results can be submitted on the web.
+	
+	Open a web browser and go to http://www.coremark.org/benchmark/index.php?pg=benchmark
+	Select the link to add a new score and follow the instructions.
+	
+Topic: Run rules
+	What is and is not allowed.
+	
+	Required:
+	1 - The benchmark needs to run for at least 10 seconds.
+	2 - All validation must succeed for seeds 0,0,0x66 and 0x3415,0x3415,0x66, 
+		buffer size of 2000 bytes total.
+		o If not using command line arguments to main:
+		> make XCFLAGS="-DPERFORMANCE_RUN=1" REBUILD=1 run1.log
+		> make XCFLAGS="-DVALIDATION_RUN=1" REBUILD=1 run2.log
+	3 - If using profile guided optimization, profile must be generated using seeds of 8,8,8,
+		and buffer size of 1200 bytes total.
+		> make XCFLAGS="-DTOTAL_DATA_SIZE=1200 -DPROFILE_RUN=1" REBUILD=1 run3.log
+	4 - All source files must be compiled with the same flags.
+	5 - All data type sizes must match size in bits such that:
+		o ee_u8 is an 8 bits datatype.
+		o ee_s16 is a 16 bits datatype.
+		o ee_u16 is a 16 bits datatype.
+		o ee_s32 is a 32 bits datatype.
+		o ee_u32 is a 32 bits datatype.
+	
+	Allowed:
+	- Changing number of iterations
+	- Changing toolchain and build/load/run options
+	- Changing method of acquiring a data memory block
+	- Changing the method of acquiring seed values
+	- Changing implementation in core_portme.c
+	- Changing configuration values in core_portme.h
+	- Changing core_portme.mak
+	
+	Not allowed:
+	- Changing any source file other than core_portme* (use make check to validate)
+
+Topic: Reporting rules
+	How to report results on a data sheet?
+
+	CoreMark 1.0 : N / C [/ P] [/ M]
+	
+	N - Number of iterations per second (with seeds 0,0,0x66, size=2000)
+	C - Compiler version and flags
+	P - Parameters such as data and code allocation specifics
+		- This parameter *may* be omitted if all data was allocated on the heap in RAM.
+		- This parameter *may not* be omitted when reporting CoreMark/MHz
+	M - Type of parallel execution (if used) and number of contexts
+		This parameter may be omitted if parallel execution was not used.
+
+	e.g. 
+	> CoreMark 1.0 : 128 / GCC 4.1.2 -O2 -fprofile-use / Heap in TCRAM / FORK:2 
+	or
+	> CoreMark 1.0 : 1400 / GCC 3.4 -O4 
+	
+	If reporting scaling results, the results must be reported as follows:
+	
+	CoreMark/MHz 1.0 : N / C / P [/ M]
+	
+	P - When reporting scaling results, memory parameter must also indicate memory frequency:core frequency ratio.
+		- If the core has cache and cache frequency to core frequency ratio is configurable, that must also be included.
+	
+	e.g.
+	> CoreMark/MHz 1.0 : 1.47 / GCC 4.1.2 -O2 / DDR3(Heap) 30:1 Memory 1:1 Cache
+
+	
+Topic: Log File Format
+	The log files have the following format
+(start example)
+2K performance run parameters for coremark.	(Run type)
+CoreMark Size    	: 666					(Buffer size)
+Total ticks			: 25875					(platform dependent value)
+Total time (secs) 	: 25.875000				(actual time in seconds)
+Iterations/Sec 		: 3864.734300			(Performance value to report)
+Iterations			: 100000				(number of iterations used)
+Compiler version	: GCC3.4.4				(Compiler and version)	
+Compiler flags		: -O2					(Compiler and linker flags)
+Memory location		: Code in flash, data in on chip RAM
+seedcrc				: 0xe9f5				(identifier for the input seeds)
+[0]crclist			: 0xe714				(validation for list part)
+[0]crcmatrix		: 0x1fd7				(validation for matrix part)
+[0]crcstate			: 0x8e3a				(validation for state part)
+[0]crcfinal			: 0x33ff				(iteration dependent output)
+Correct operation validated. See readme.txt for run and reporting rules.  (*Only when run is successful*)
+CoreMark 1.0 : 6508.490622 / GCC3.4.4 -O2 / Heap 						  (*Only on a successful performance run*)		
+(end example)
+
+Topic: Legal
+See LICENSE.txt or the word document file under docs/LICENSE.doc.
+For more information on your legal rights to use this benchmark, please see
+http://www.coremark.org/download/register.php?pg=register	
+
+Topic: Credits
+Many thanks to all of the individuals who helped with the development or testing of CoreMark including (Sorted by company name)
+o Alan Anderson, ADI
+o Adhikary Rajiv, ADI
+o Elena Stohr, ARM
+o Ian Rickards, ARM
+o Andrew Pickard, ARM
+o Trent Parker, CAVIUM
+o Shay Gal-On, EEMBC
+o Markus Levy, EEMBC
+o Ron Olson, IBM
+o Eyal Barzilay, MIPS
+o Jens Eltze, NEC
+o Hirohiko Ono, NEC
+o Ulrich Drees, NEC
+o Frank Roscheda, NEC
+o Rob Cosaro, NXP
+o Shumpei Kawasaki, RENESAS
+'''
--- a/am-kernels/benchmarks/coremark/include/core_portme.h
+++ b/am-kernels/benchmarks/coremark/include/core_portme.h
@ -0,0 +1,188 @@
+/* Topic : Description
+	This file contains configuration constants required to execute on different platforms
+*/
+
+
+#ifndef CORE_PORTME_H
+#define CORE_PORTME_H
+
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+#define ITERATIONS 1000 /* port-level iteration count; it is consumed outside this header */
+#define MEM_METHOD MEM_STATIC /* forces the static-array method, so the MEM_METHOD default further down in this header never applies */
+
+/************************/
+/* Data types and settings */
+/************************/
+/* Configuration : HAS_FLOAT
+	Define to 1 if the platform supports floating point.
+	coremark.h uses this flag to choose between double and ee_u32 for secs_ret.
+*/
+#ifndef HAS_FLOAT
+#define HAS_FLOAT 0
+#endif
+/* Configuration : HAS_TIME_H
+	Define to 1 if platform has the time.h header file,
+	and implementation of functions thereof.
+*/
+#ifndef HAS_TIME_H
+#define HAS_TIME_H 0
+#endif
+/* Configuration : USE_CLOCK
+	Define to 1 if platform has the time.h header file,
+	and implementation of functions thereof.
+	NOTE(review): this wording is identical to HAS_TIME_H; presumably the flag
+	selects clock()-based timing in the port layer - confirm in core_portme.c.
+*/
+#ifndef USE_CLOCK
+#define USE_CLOCK 0
+#endif
+/* Configuration : HAS_STDIO
+	Define to 1 if the platform has stdio.h.
+	When set, coremark.h includes <stdio.h>.
+*/
+#ifndef HAS_STDIO
+#define HAS_STDIO 0
+#endif
+/* Configuration : HAS_PRINTF
+	Define to 1 if the platform has stdio.h and implements the printf function.
+	When set, coremark.h maps ee_printf to printf. With HAS_STDIO left at 0 no
+	<stdio.h> is included, so printf presumably comes from <klib.h> - TODO confirm.
+*/
+#ifndef HAS_PRINTF
+#define HAS_PRINTF 1
+#endif
+
+/* Configuration : CORE_TICKS
+	Define type of return from the timing functions.
+	This port uses a 32-bit unsigned value (uint32_t); get_time() and
+	time_in_secs() in coremark.h are declared in terms of it.
+ */
+typedef uint32_t CORE_TICKS;
+
+/* Definitions : COMPILER_VERSION, COMPILER_FLAGS, MEM_LOCATION
+	Initialize these strings per platform.
+	Each one can be overridden from the build command line; the values below are only defaults.
+
+	MEM_LOCATION describes where the benchmark data lives. This header forces
+	MEM_METHOD to MEM_STATIC, so the default description is "STATIC".
+*/
+#ifndef COMPILER_VERSION
+ #ifdef __GNUC__
+ /* the space keeps the literal and the macro separate tokens, which C++11 compilers require */
+ #define COMPILER_VERSION "GCC" __VERSION__
+ #else
+ #define COMPILER_VERSION "Please put compiler version here (e.g. gcc 4.1)"
+ #endif
+#endif
+#ifndef COMPILER_FLAGS
+ /* NOTE(review): expands to nothing, so it cannot be used as a printf argument - confirm it is unused or supply it from the build */
+ #define COMPILER_FLAGS
+#endif
+#ifndef MEM_LOCATION
+ #define MEM_LOCATION "STATIC"
+#endif
+
+/* Data Types :
+	To avoid compiler issues, define the data types that need to be used for 8b, 16b and 32b in <core_portme.h>.
+
+	*Important* :
+	ee_ptr_int needs to be the data type used to hold pointers, otherwise coremark may fail!!!
+*/
+typedef signed short ee_s16;
+typedef unsigned short ee_u16;
+typedef signed int ee_s32;
+typedef double ee_f32; /* NOTE(review): named f32 but declared as double */
+typedef unsigned char ee_u8;
+typedef unsigned int ee_u32;
+typedef unsigned long ee_ptr_int; /* assumes unsigned long is as wide as a pointer on the target - TODO confirm */
+typedef size_t ee_size_t;
+/* align_mem :
+	This macro is used to align an offset to point to a 32b value. It is used in the Matrix algorithm to initialize the input memory blocks.
+	It rounds x up to the next multiple of 4 (a value already on a 4-byte boundary is returned unchanged)
+	and yields the result as a void pointer.
+*/
+#define align_mem(x) (void *)(4 + (((unsigned long)(x) - 1) & ~3))
+
+/* Configuration : SEED_METHOD
+	Defines method to get seed values that cannot be computed at compile time.
+
+	Valid values :
+	SEED_ARG - from command line.
+	SEED_FUNC - from a system function.
+	SEED_VOLATILE - from volatile variables.
+
+	This port defaults to SEED_VOLATILE unless the build defines SEED_METHOD.
+*/
+#ifndef SEED_METHOD
+#define SEED_METHOD SEED_VOLATILE
+#endif
+
+/* Configuration : MEM_METHOD
+	Defines method to get a block of memory.
+
+	Valid values :
+	MEM_MALLOC - for platforms that implement malloc and have malloc.h.
+	MEM_STATIC - to use a static memory array.
+	MEM_STACK - to allocate the data block on the stack (NYI).
+
+	Note :
+	MEM_METHOD is already defined as MEM_STATIC near the top of this header,
+	so the MEM_STACK default below is never selected.
+*/
+#ifndef MEM_METHOD
+#define MEM_METHOD MEM_STACK
+#endif
+
+/* Configuration : MULTITHREAD
+	Define for parallel execution
+
+	Valid values :
+	1 - only one context (default).
+	N>1 - will execute N copies in parallel.
+
+	Note :
+	If this flag is defined to more than 1, an implementation for launching parallel contexts must be defined.
+
+	Two sample implementations are provided. Use <USE_PTHREAD> or <USE_FORK> to enable them.
+
+	It is valid to have a different implementation of <core_start_parallel> and <core_end_parallel> in <core_portme.c>,
+	to fit a particular architecture.
+	NOTE(review): coremark.h declares core_stop_parallel, not core_end_parallel - confirm which name is intended.
+
+	The USE_PTHREAD / USE_FORK / USE_SOCKET defaults below are only defined when
+	MULTITHREAD was not already defined by the build.
+*/
+#ifndef MULTITHREAD
+#define MULTITHREAD 1
+#define USE_PTHREAD 0
+#define USE_FORK 0
+#define USE_SOCKET 0
+#endif
+
+/* Configuration : MAIN_HAS_NOARGC
+	Needed if platform does not support getting arguments to main.
+
+	Valid values :
+	0 - argc/argv to main is supported
+	1 - argc/argv to main is not supported
+
+	Note :
+	This flag only matters if MULTITHREAD has been defined to a value greater than 1.
+*/
+#ifndef MAIN_HAS_NOARGC
+#define MAIN_HAS_NOARGC 0
+#endif
+
+/* Configuration : MAIN_HAS_NORETURN
+	Needed if platform does not support returning a value from main.
+
+	Valid values :
+	0 - main returns an int, and return value will be 0.
+	1 - platform does not support returning a value from main
+
+	coremark.h derives MAIN_RETURN_TYPE and MAIN_RETURN_VAL from this flag.
+*/
+#ifndef MAIN_HAS_NORETURN
+#define MAIN_HAS_NORETURN 0
+#endif
+
+/* Variable : default_num_contexts
+	Not used for this simple port, must contain the value 1.
+*/
+extern ee_u32 default_num_contexts;
+
+typedef struct CORE_PORTABLE_S {
+	ee_u8	portable_id; /* the only port-specific field in this port */
+} core_portable; /* port-specific state; embedded as the `port` member of core_results in coremark.h */
+
+/* target specific init/fini; both are defined outside this header */
+void portable_init(core_portable *p, int *argc, char *argv[]);
+void portable_fini(core_portable *p);
+
+#if !defined(PROFILE_RUN) && !defined(PERFORMANCE_RUN) && !defined(VALIDATION_RUN) /* no run type chosen by the build: derive one from TOTAL_DATA_SIZE */
+#if (TOTAL_DATA_SIZE==1200) /* 1200 bytes selects the profile run */
+#define PROFILE_RUN 1
+#elif (TOTAL_DATA_SIZE==2000) /* 2000 bytes selects the performance run */
+#define PERFORMANCE_RUN 1
+#else /* any other size selects the validation run */
+#define VALIDATION_RUN 1
+#endif
+#endif
+
+
+#endif /* CORE_PORTME_H */
--- a/am-kernels/benchmarks/coremark/include/coremark.h
+++ b/am-kernels/benchmarks/coremark/include/coremark.h
@ -0,0 +1,174 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+/* Topic: Description
+	This file contains  declarations of the various benchmark functions.
+*/
+
+/* Configuration: TOTAL_DATA_SIZE
+	Define total size for data algorithms will operate on.
+	The default is 2000 bytes. It is parenthesized so the macro expands safely
+	inside larger expressions (for example as a divisor); core_portme.h also
+	compares it against 1200 and 2000 to pick the run type.
+*/
+#ifndef TOTAL_DATA_SIZE
+#define TOTAL_DATA_SIZE (2*1000)
+#endif
+
+#define SEED_ARG 0 /* seeds come from the command line */
+#define SEED_FUNC 1 /* seeds come from a system function */
+#define SEED_VOLATILE 2 /* seeds come from volatile variables */
+
+#define MEM_STATIC 0 /* data block is a static memory array */
+#define MEM_MALLOC 1 /* data block comes from malloc */
+#define MEM_STACK 2 /* data block is allocated on the stack */
+
+#include "core_portme.h" /* included after the constants above because it uses them to set SEED_METHOD and MEM_METHOD */
+
+#if HAS_STDIO
+#include <stdio.h>
+#endif
+#if HAS_PRINTF
+#define ee_printf printf /* the benchmark sources print through ee_printf */
+#endif
+
+/* Actual benchmark execution in iterate.
+	pres is an untyped pointer that iterate (core_main.c) casts to core_results *.
+*/
+void *iterate(void *pres);
+
+/* Typedef: secs_ret
+	For machines that have floating point support, get number of seconds as a double.
+	Otherwise an unsigned int.
+*/
+#if HAS_FLOAT
+typedef double secs_ret;
+#else
+typedef ee_u32 secs_ret;
+#endif
+
+#if MAIN_HAS_NORETURN
+#define MAIN_RETURN_VAL /* empty: main returns nothing */
+#define MAIN_RETURN_TYPE void
+#else
+#define MAIN_RETURN_VAL 0 /* main returns int 0 */
+#define MAIN_RETURN_TYPE int
+#endif
+
+void start_time(void); /* timing hooks: declared here, implemented outside this header */
+void stop_time(void);
+CORE_TICKS get_time(void); /* presumably the ticks measured between start_time and stop_time - confirm in the port layer */
+secs_ret time_in_secs(CORE_TICKS ticks); /* converts a tick count to seconds in the secs_ret type */
+
+/* Misc useful functions.
+	The crc* helpers take a new value plus a running 16b crc and return the updated crc
+	(callers in this code base use them as crc=crcuNN(value,crc)); all of these are
+	implemented outside this header.
+*/
+ee_u16 crcu8(ee_u8 data, ee_u16 crc);
+ee_u16 crc16(ee_s16 newval, ee_u16 crc);
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc);
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc);
+ee_u8 check_data_types(); /* presumably verifies the ee_* type sizes required by the run rules - confirm in its definition */
+void *portable_malloc(ee_size_t size);
+void portable_free(void *p);
+ee_s32 parseval(char *valstring); /* presumably parses a numeric command-line value - confirm in its definition */
+
+/* Algorithm IDS : one bit per benchmark algorithm so they can be combined into a mask
+	(presumably the `execs` bitmask of core_results - confirm in core_main.c) */
+#define ID_LIST 	(1<<0)
+#define ID_MATRIX 	(1<<1)
+#define ID_STATE 	(1<<2)
+#define ALL_ALGORITHMS_MASK (ID_LIST|ID_MATRIX|ID_STATE) /* all three bits set */
+#define NUM_ALGORITHMS 3 /* number of ID_* bits defined above */
+
+/* list data structures : a singly linked list whose cells point at separately stored data items */
+typedef struct list_data_s {
+	ee_s16 data16; /* 16b payload; core_list_join.c documents its bit layout */
+	ee_s16 idx; /* index capturing the initial order of the list */
+} list_data;
+
+typedef struct list_head_s {
+	struct list_head_s *next; /* next cell, NULL at the end of the list */
+	struct list_data_s *info; /* data item owned by this cell */
+} list_head;
+
+
+/*matrix benchmark related stuff */
+#define MATDAT_INT 1 /* 1 selects the integer matrix types below */
+#if MATDAT_INT
+typedef ee_s16 MATDAT; /* matrix element type */
+typedef ee_s32 MATRES; /* matrix result type */
+#else
+typedef ee_f16 MATDAT; /* NOTE(review): no ee_f16 typedef is visible in core_portme.h; this branch presumably would not compile as is */
+typedef ee_f32 MATRES;
+#endif
+
+typedef struct MAT_PARAMS_S {
+	int N; /* presumably the matrix dimension - confirm in the matrix kernel */
+	MATDAT *A; /* element-typed buffer */
+	MATDAT *B; /* element-typed buffer */
+	MATRES *C; /* result-typed buffer */
+} mat_params;
+
+/* state machine related stuff */
+/* List of all the possible states for the FSM */
+typedef enum CORE_STATE {
+	CORE_START=0,
+	CORE_INVALID,
+	CORE_S1,
+	CORE_S2,
+	CORE_INT,
+	CORE_FLOAT,
+	CORE_EXPONENT,
+	CORE_SCIENTIFIC,
+	NUM_CORE_STATES /* not a state: equals the number of states listed above */
+} core_state_e ;
+
+
+/* Helper structure to hold results : carries both the inputs of a benchmark context and the CRCs it produces */
+typedef struct RESULTS_S {
+	/* inputs */
+	ee_s16	seed1;		/* Initializing seed */
+	ee_s16	seed2;		/* Initializing seed */
+	ee_s16	seed3;		/* Initializing seed */
+	void	*memblock[4];	/* Pointer to safe memory location */
+	ee_u32	size;		/* Size of the data */
+	ee_u32 iterations;		/* Number of iterations to execute */
+	ee_u32	execs;		/* Bitmask of operations to execute */
+	struct list_head_s *list; /* list worked on by core_bench_list */
+	mat_params mat; /* matrix parameters handed to core_bench_matrix */
+	/* outputs */
+	ee_u16	crc; /* running crc updated while the benchmark executes */
+	ee_u16	crclist; /* crc value captured by iterate after its first iteration */
+	ee_u16	crcmatrix; /* matrix kernel result recorded by calc_func while this field is still 0 */
+	ee_u16	crcstate; /* state kernel result recorded by calc_func while this field is still 0 */
+	ee_s16	err; /* presumably an error count - confirm in core_main.c */
+	/* multithread specific */
+	core_portable port;
+} core_results;
+
+/* Multicore execution handling : only declared when more than one context is requested.
+	NOTE(review): core_portme.h's MULTITHREAD notes mention core_end_parallel, while the name declared here is core_stop_parallel. */
+#if (MULTITHREAD>1)
+ee_u8 core_start_parallel(core_results *res);
+ee_u8 core_stop_parallel(core_results *res);
+#endif
+
+/* list benchmark functions (defined in core_list_join.c) */
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed);
+ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx);
+
+/* state benchmark functions (defined outside this header) */
+void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p);
+ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock,
+		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc);
+
+/* matrix benchmark functions (defined outside this header) */
+ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p);
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc);
+
--- a/am-kernels/benchmarks/coremark/src/core_list_join.c
+++ b/am-kernels/benchmarks/coremark/src/core_list_join.c
@ -0,0 +1,496 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+
+#include "coremark.h"
+/*
+Topic: Description
+	Benchmark using a linked list.
+
+	Linked list is a common data structure used in many applications.
+
+	For our purposes, this will exercise the memory units of the processor.
+	In particular, usage of the list pointers to find and alter data.
+
+	We are not using Malloc since some platforms do not support this library.
+
+	Instead, the memory block being passed in is used to create a list,
+	and the benchmark takes care not to add more items than can be
+	accommodated by the memory block. The porting layer will make sure
+	that we have a valid memory block.
+
+	All operations are done in place, without using any extra memory.
+
+	The list itself contains list pointers and pointers to data items.
+	Data items contain the following:
+
+	idx - An index that captures the initial order of the list.
+	data - Variable data initialized based on the input parameters. The 16b are divided as follows:
+	o Upper 8b are backup of original data.
+	o Bit 7 indicates if the lower 7 bits are to be used as is or calculated.
+	o Bits 0-2 indicate type of operation to perform to get a 7b value.
+	o Bits 3-6 provide input for the operation.
+
+*/
+
+/* local functions : forward declarations for helpers defined later in this file.
+	list_cmp is the comparator signature accepted by core_list_mergesort. */
+
+list_head *core_list_find(list_head *list,list_data *info);
+list_head *core_list_reverse(list_head *list);
+list_head *core_list_remove(list_head *item);
+list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified);
+list_head *core_list_insert_new(list_head *insert_point
+	, list_data *info, list_head **memblock, list_data **datablock
+	, list_head *memblock_end, list_data *datablock_end);
+typedef ee_s32(*list_cmp)(list_data *a, list_data *b, core_results *res);
+list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res);
+/* Function: calc_func
+	Return the 7b value associated with a list data word, calculating and caching it on first use.
+
+	Parameters:
+	pdata - pointer to the 16b data word of a list item; rewritten in place when a result is cached.
+	res - benchmark state; supplies the inputs of the state/matrix kernels and receives crc updates.
+
+	Returns:
+	The low 7 bits of the cached or freshly calculated value.
+*/
+ee_s16 calc_func(ee_s16 *pdata, core_results *res) {
+	ee_s16 data=*pdata;
+	ee_s16 retval;
+	ee_u8 optype=(data>>7) & 1; /* bit 7 indicates if the function result has been cached */
+	if (optype) /* if cached, use cache */
+		return (data & 0x007f);
+	else { /* otherwise calculate and cache the result */
+		ee_s16 flag=data & 0x7; /* bits 0-2 is type of function to perform */
+		ee_s16 dtype=((data>>3) & 0xf); /* bits 3-6 is specific data for the operation */
+		dtype |= dtype << 4; /* replicate the lower 4 bits to get an 8b value */
+		switch (flag) {
+			case 0: /* type 0: run the state kernel */
+				if (dtype<0x22) /* set min period for bit corruption */
+					dtype=0x22;
+				retval=core_bench_state(res->size,res->memblock[3],res->seed1,res->seed2,dtype,res->crc);
+				if (res->crcstate==0) /* record the result only while nothing non-zero is stored yet */
+					res->crcstate=retval;
+				break;
+			case 1: /* type 1: run the matrix kernel */
+				retval=core_bench_matrix(&(res->mat),dtype,res->crc);
+				if (res->crcmatrix==0) /* record the result only while nothing non-zero is stored yet */
+					res->crcmatrix=retval;
+				break;
+			default: /* any other type: use the data word itself */
+				retval=data;
+				break;
+		}
+		res->crc=crcu16(retval,res->crc); /* fold the result into the running crc */
+		retval &= 0x007f;
+		*pdata = (data & 0xff00) | 0x0080 | retval; /* cache the result */
+		return retval;
+	}
+}
+/* Function: cmp_complex
+	Compare the data item in a list cell.
+
+	Can be used by mergesort.
+
+	Both items are first passed through calc_func, so a comparison may run the
+	state/matrix kernels and rewrite the items' data16 fields with cached results.
+
+	Returns:
+	Difference of the two calculated values (negative, zero or positive).
+*/
+ee_s32 cmp_complex(list_data *a, list_data *b, core_results *res) {
+	ee_s16 val1=calc_func(&(a->data16),res);
+	ee_s16 val2=calc_func(&(b->data16),res);
+	return val1 - val2;
+}
+
+/* Function: cmp_idx
+	Compare the idx item in a list cell, and regen the data.
+
+	Can be used by mergesort.
+
+	The data is only regenerated when res is NULL: the low byte of each data16
+	is then overwritten with a copy of its upper byte.
+
+	Returns:
+	Difference of the two idx values (negative, zero or positive).
+*/
+ee_s32 cmp_idx(list_data *a, list_data *b, core_results *res) {
+	if (res==NULL) {
+		a->data16 = (a->data16 & 0xff00) | (0x00ff & (a->data16>>8)); /* keep the upper byte, copy it into the lower byte */
+		b->data16 = (b->data16 & 0xff00) | (0x00ff & (b->data16>>8));
+	}
+	return a->idx - b->idx;
+}
+/* Function: copy_info
+	Copy both fields (data16 and idx) of one list data item into another.
+*/
+void copy_info(list_data *to,list_data *from) {
+	to->data16=from->data16;
+	to->idx=from->idx;
+}
+
+/* Benchmark for linked list:
+	- Try to find multiple data items.
+	- List sort
+	- Operate on data from list (crc)
+	- Single remove/reinsert
+	* At the end of this function, the list is back to original state
+
+	Parameters:
+	res - benchmark state; res->list is the list to work on and res->seed3 the number of finds.
+	finder_idx - first index to search for. A negative value makes the finds match on data
+		instead of index, and the sort by data content only happens when it is greater than 0.
+
+	Returns:
+	16b value combining the find statistics with crcs taken while walking the list.
+*/
+ee_u16 core_bench_list(core_results *res, ee_s16 finder_idx) {
+	ee_u16 retval=0;
+	ee_u16 found=0,missed=0;
+	list_head *list=res->list;
+	ee_s16 find_num=res->seed3;
+	list_head *this_find;
+	list_head *finder, *remover;
+	list_data info = {0}; /* zero-initialized with {0}: an empty {} initializer is not valid C before C23 */
+	ee_s16 i;
+
+	info.idx=finder_idx;
+	/* find <find_num> values in the list, and change the list each time (reverse and cache if value found) */
+	for (i=0; i<find_num; i++) {
+		info.data16= (i & 0xff) ;
+		this_find=core_list_find(list,&info);
+		list=core_list_reverse(list);
+		if (this_find==NULL) {
+			missed++;
+			retval+=(list->next->info->data16 >> 8) & 1;
+		}
+		else {
+			found++;
+			if (this_find->info->data16 & 0x1) /* use found value */
+				retval+=(this_find->info->data16 >> 9) & 1;
+			/* and cache next item at the head of the list (if any) */
+			if (this_find->next != NULL) {
+				finder = this_find->next;
+				this_find->next = finder->next;
+				finder->next=list->next;
+				list->next=finder;
+			}
+		}
+		if (info.idx>=0) /* only advance the index when searching by index */
+			info.idx++;
+#if CORE_DEBUG
+	ee_printf("List find %d: [%d,%d,%d]\n",i,retval,missed,found);
+#endif
+	}
+	retval+=found*4-missed;
+	/* sort the list by data content and remove one item*/
+	if (finder_idx>0)
+		list=core_list_mergesort(list,cmp_complex,res);
+	remover=core_list_remove(list->next);
+	/* CRC data content of list from location of index N forward, and then undo remove */
+	finder=core_list_find(list,&info);
+	if (!finder)
+		finder=list->next;
+	/* NOTE(review): each step feeds list->info (the head's data), not finder->info, into the crc.
+	   The expected crc tables in core_main.c presumably depend on this, so it is left unchanged. */
+	while (finder) {
+		retval=crc16(list->info->data16,retval);
+		finder=finder->next;
+	}
+#if CORE_DEBUG
+	ee_printf("List sort 1: %04x\n",retval);
+#endif
+	remover=core_list_undo_remove(remover,list->next);
+	/* sort the list by index, in effect returning the list to original state */
+	list=core_list_mergesort(list,cmp_idx,NULL);
+	/* CRC data content of list (same list->info pattern as the loop above) */
+	finder=list->next;
+	while (finder) {
+		retval=crc16(list->info->data16,retval);
+		finder=finder->next;
+	}
+#if CORE_DEBUG
+	ee_printf("List sort 2: %04x\n",retval);
+#endif
+	return retval;
+}
+/* Function: core_list_init
+	Initialize list with data.
+
+	The memory block is split in two regions: `size` list_head cells starting at memblock,
+	followed directly by `size` list_data cells. A head item and a tail item are created first,
+	then data items are inserted after the head, every item is given an idx, and the list
+	is sorted by idx.
+
+	Parameters:
+	blksize - Size of memory to be initialized.
+	memblock - Pointer to memory block.
+	seed - 	Actual values chosen depend on the seed parameter.
+		The seed parameter MUST be supplied from a source that cannot be determined at compile time
+
+	Returns:
+	Pointer to the head of the list.
+
+	NOTE(review): size is unsigned, so a blksize below 2*per_item would wrap around;
+	callers are presumed to pass a large enough block.
+*/
+list_head *core_list_init(ee_u32 blksize, list_head *memblock, ee_s16 seed) {
+	/* calculated pointers for the list */
+	ee_u32 per_item=16+sizeof(struct list_data_s);
+	ee_u32 size=(blksize/per_item)-2; /* to accommodate systems with 64b pointers, and make sure same code is executed, set max list elements */
+	list_head *memblock_end=memblock+size;
+	list_data *datablock=(list_data *)(memblock_end);
+	list_data *datablock_end=datablock+size;
+	/* some useful variables */
+	ee_u32 i;
+	list_head *finder,*list=memblock;
+	list_data info;
+
+	/* create a fake items for the list head and tail */
+	list->next=NULL;
+	list->info=datablock;
+	list->info->idx=0x0000;
+	list->info->data16=(ee_s16)0x8080;
+	memblock++;
+	datablock++;
+	info.idx=0x7fff;
+	info.data16=(ee_s16)0xffff;
+	core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
+
+	/* then insert size items (core_list_insert_new silently refuses once either region is full) */
+	for (i=0; i<size; i++) {
+		ee_u16 datpat=((ee_u16)(seed^i) & 0xf);
+		ee_u16 dat=(datpat<<3) | (i&0x7); /* alternate between algorithms */
+		info.data16=(dat<<8) | dat;		/* fill the data with actual data and upper bits with rebuild value */
+		core_list_insert_new(list,&info,&memblock,&datablock,memblock_end,datablock_end);
+	}
+	/* and now index the list so we know initial seed order of the list */
+	finder=list->next;
+	i=1;
+	while (finder->next!=NULL) { /* stops at the last item, which keeps its idx of 0x7fff */
+		if (i<size/5) /* first 20% of the list in order */
+			finder->info->idx=i++;
+		else {
+			ee_u16 pat=(ee_u16)(i++ ^ seed); /* get a pseudo random number */
+			finder->info->idx=0x3fff & (((i & 0x07) << 8) | pat); /* make sure the mixed items end up after the ones in sequence */
+		}
+		finder=finder->next;
+	}
+	list = core_list_mergesort(list,cmp_idx,NULL);
+#if CORE_DEBUG
+	ee_printf("Initialized list:\n");
+	finder=list;
+	while (finder) {
+		ee_printf("[%04x,%04x]",finder->info->idx,(ee_u16)finder->info->data16);
+		finder=finder->next;
+	}
+	ee_printf("\n");
+#endif
+	return list;
+}
+
+/* Function: core_list_insert_new
+	Insert an item to the list
+
+	The new cell and its data item are taken from the next free slots of the two
+	regions, and both cursors are advanced.
+
+	Parameters:
+	insert_point - where to insert the item (the new item is linked right after it).
+	info - data for the cell (copied into the new data item).
+	memblock - pointer for the list header (cursor, advanced on success)
+	datablock - pointer for the list data (cursor, advanced on success)
+	memblock_end - end of region for list headers
+	datablock_end - end of region for list data
+
+	Returns:
+	Pointer to new item, or NULL (list left untouched) when either region has no room left.
+*/
+list_head *core_list_insert_new(list_head *insert_point, list_data *info, list_head **memblock, list_data **datablock
+	, list_head *memblock_end, list_data *datablock_end) {
+	list_head *newitem;
+
+	if ((*memblock+1) >= memblock_end) /* header region exhausted */
+		return NULL;
+	if ((*datablock+1) >= datablock_end) /* data region exhausted */
+		return NULL;
+
+	newitem=*memblock;
+	(*memblock)++;
+	newitem->next=insert_point->next;
+	insert_point->next=newitem;
+
+	newitem->info=*datablock;
+	(*datablock)++;
+	copy_info(newitem->info,info);
+
+	return newitem;
+}
+
+/* Function: core_list_remove
+	Remove an item from the list.
+
+	Operation:
+	For a singly linked list, remove by copying the data from the next item
+	over to the current cell, and unlinking the next item.
+	The copy is done by swapping the info pointers of the two cells, so the
+	unlinked cell leaves with the data of the item being removed.
+
+	Note:
+	since there is always a fake item at the end of the list, no need to check for NULL.
+
+	Parameters:
+	item - cell whose data is to be removed; it must have a successor.
+
+	Returns:
+	Removed item.
+*/
+list_head *core_list_remove(list_head *item) {
+	list_data *tmp;
+	list_head *ret=item->next;
+	/* swap data pointers */
+	tmp=item->info;
+	item->info=ret->info;
+	ret->info=tmp;
+	/* and eliminate item */
+	item->next=item->next->next;
+	ret->next=NULL;
+	return ret;
+}
+
+/* Function: core_list_undo_remove
+	Undo a remove operation.
+
+	Operation:
+	Since we want each iteration of the benchmark to be exactly the same,
+	we need to be able to undo a remove.
+	Link the removed item back into the list, and switch the info items.
+	The removed cell is linked in directly after item_modified.
+
+	Parameters:
+	item_removed - Return value from the <core_list_remove>
+	item_modified - List item that was modified during <core_list_remove>
+
+	Returns:
+	The item that was linked back to the list.
+
+*/
+list_head *core_list_undo_remove(list_head *item_removed, list_head *item_modified) {
+	list_data *tmp;
+	/* swap data pointers */
+	tmp=item_removed->info;
+	item_removed->info=item_modified->info;
+	item_modified->info=tmp;
+	/* and insert item */
+	item_removed->next=item_modified->next;
+	item_modified->next=item_removed;
+	return item_removed;
+}
+
+/* Function: core_list_find
+	Find an item in the list
+
+	Operation:
+	Find an item by idx (if info->idx is not negative) or by specific data value
+	(the low byte of each item's data16 is compared against info->data16).
+
+	Parameters:
+	list - list head
+	info - idx or data to find
+
+	Returns:
+	Found item, or NULL if not found.
+*/
+list_head *core_list_find(list_head *list,list_data *info) {
+	if (info->idx>=0) { /* search by index */
+		while (list && (list->info->idx != info->idx))
+			list=list->next;
+		return list;
+	} else { /* search by data value */
+		while (list && ((list->info->data16 & 0xff) != info->data16))
+			list=list->next;
+		return list;
+	}
+}
+/* Function: core_list_reverse
+	Reverse a list
+
+	Operation:
+	Rearrange the pointers so the list is reversed.
+
+	Parameters:
+	list - list head
+
+	Returns:
+	Head of the reversed list (the former last item), or NULL when list is NULL.
+*/
+
+list_head *core_list_reverse(list_head *list) {
+	list_head *next=NULL, *tmp; /* next accumulates the already reversed part */
+	while (list) {
+		tmp=list->next;
+		list->next=next;
+		next=list;
+		list=tmp;
+	}
+	return next;
+}
+/* Function: core_list_mergesort
+	Sort the list in place without recursion.
+
+	Description:
+	Use mergesort, as for linked list this is a realistic solution.
+	Also, since this is aimed at embedded, care was taken to use iterative rather than recursive algorithm.
+	The sort can either return the list to original order (by idx) ,
+	or use the data item to invoke other algorithms and change the order of the list.
+	Each pass merges neighbouring runs of `insize` items, and `insize` doubles
+	until a pass needs at most one merge.
+
+	Parameters:
+	list - list to be sorted.
+	cmp - cmp function to use
+	res - benchmark state handed to cmp unchanged (callers sorting with cmp_idx pass NULL).
+
+	Returns:
+	New head of the list, or NULL when list is NULL.
+
+	Note:
+	We have a special header for the list that will always be first,
+	but the algorithm could theoretically modify where the list starts.
+
+ */
+list_head *core_list_mergesort(list_head *list, list_cmp cmp, core_results *res) {
+    list_head *p, *q, *e, *tail;
+    ee_s32 insize, nmerges, psize, qsize, i;
+
+    /* an empty list has nothing to sort; without this guard tail would stay NULL
+       and the tail->next store after the first pass would dereference it */
+    if (!list)
+        return NULL;
+
+    insize = 1;
+
+    while (1) {
+        p = list;
+        list = NULL;
+        tail = NULL;
+
+        nmerges = 0;  /* count number of merges we do in this pass */
+
+        while (p) {
+            nmerges++;  /* there exists a merge to be done */
+            /* step `insize' places along from p */
+            q = p;
+            psize = 0;
+            for (i = 0; i < insize; i++) {
+                psize++;
+			    q = q->next;
+                if (!q) break;
+            }
+
+            /* if q hasn't fallen off end, we have two lists to merge */
+            qsize = insize;
+
+            /* now we have two lists; merge them */
+            while (psize > 0 || (qsize > 0 && q)) {
+
+				/* decide whether next element of merge comes from p or q */
+				if (psize == 0) {
+				    /* p is empty; e must come from q. */
+				    e = q; q = q->next; qsize--;
+				} else if (qsize == 0 || !q) {
+				    /* q is empty; e must come from p. */
+				    e = p; p = p->next; psize--;
+				} else if (cmp(p->info,q->info,res) <= 0) {
+				    /* First element of p is lower (or same); e must come from p. */
+				    e = p; p = p->next; psize--;
+				} else {
+				    /* First element of q is lower; e must come from q. */
+				    e = q; q = q->next; qsize--;
+				}
+
+		        /* add the next element to the merged list */
+				if (tail) {
+				    tail->next = e;
+				} else {
+				    list = e;
+				}
+				tail = e;
+	        }
+
+			/* now p has stepped `insize' places along, and q has too */
+			p = q;
+        }
+
+	    tail->next = NULL;
+
+        /* If we have done only one merge, we're finished. */
+        if (nmerges <= 1)   /* allow for nmerges==0, the empty list case */
+            return list;
+
+        /* Otherwise repeat, merging lists twice the size */
+        insize *= 2;
+    }
+#if COMPILER_REQUIRES_SORT_RETURN
+	return list;
+#endif
+}
--- a/am-kernels/benchmarks/coremark/src/core_main.c
+++ b/am-kernels/benchmarks/coremark/src/core_main.c
@ -0,0 +1,339 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+/* File: core_main.c
+	This file contains the framework to acquire a block of memory, seed initial parameters, run the benchmark and report the results.
+*/
+#include "coremark.h"
+
+/* Function: iterate
+	Run the benchmark for a specified number of iterations.
+
+	Operation:
+	Clears the four CRC fields of the results block, then for each of res->iterations passes:
+		a - Call <core_bench_list> with second argument 1 and fold its result into res->crc.
+		b - Call <core_bench_list> with second argument -1 and fold its result into res->crc.
+		c - On the first pass only, copy res->crc into res->crclist.
+
+	Parameters:
+	pres - Pointer to a <core_results> block, passed as void*.
+
+	Returns:
+	NULL.
+*/
+/* Expected per-algorithm CRCs; main indexes these with the known_id it derives from the seed CRC. */
+static ee_u16 list_known_crc[]   =      {(ee_u16)0xd4b0,(ee_u16)0x3340,(ee_u16)0x6a79,(ee_u16)0xe714,(ee_u16)0xe3c1};
+static ee_u16 matrix_known_crc[] =      {(ee_u16)0xbe52,(ee_u16)0x1199,(ee_u16)0x5608,(ee_u16)0x1fd7,(ee_u16)0x0747};
+static ee_u16 state_known_crc[]  =      {(ee_u16)0x5e47,(ee_u16)0x39bf,(ee_u16)0xe5a4,(ee_u16)0x8e3a,(ee_u16)0x8d84};
+void *iterate(void *pres) {
+	ee_u32 i;
+	ee_u16 crc;
+	core_results *res=(core_results *)pres;
+	ee_u32 iterations=res->iterations;
+	/* start every run from a clean CRC state */
+	res->crc=0;
+	res->crclist=0;
+	res->crcmatrix=0;
+	res->crcstate=0;
+
+	for (i=0; i<iterations; i++) {
+		crc=core_bench_list(res,1);
+		res->crc=crcu16(crc,res->crc);
+		crc=core_bench_list(res,-1);
+		res->crc=crcu16(crc,res->crc);
+		/* the value after the first pass is what main compares against list_known_crc */
+		if (i==0) res->crclist=res->crc;
+	}
+	return NULL;
+}
+
+/* Seed acquisition. With SEED_ARG the seeds are fetched through get_seed_args using main's
+   argc/argv; otherwise get_seed_32 supplies them. In both cases get_seed narrows the
+   32-bit result to ee_s16. */
+#if (SEED_METHOD==SEED_ARG)
+ee_s32 get_seed_args(int i, int argc, char *argv[]);
+#define get_seed(x) (ee_s16)get_seed_args(x,argc,argv)
+#define get_seed_32(x) get_seed_args(x,argc,argv)
+#else /* via function or volatile */
+ee_s32 get_seed_32(int i);
+#define get_seed(x) (ee_s16)get_seed_32(x)
+#endif
+
+/* Backing store for the benchmark data when the static memory method is selected. */
+#if (MEM_METHOD==MEM_STATIC)
+ee_u8 static_memblk[TOTAL_DATA_SIZE];
+#endif
+/* Display names for the memory methods - presumably indexed by MEM_METHOD; TODO confirm,
+   nothing in the code visible here reads this table. */
+char *mem_name[3] = {"Static","Heap","Stack"};
+/* Function: main
+	Main entry routine for the benchmark.
+	This function is responsible for the following steps:
+
+	1 - Initialize input seeds from a source that cannot be determined at compile time.
+	2 - Initialize memory block for use.
+	3 - Run and time the benchmark.
+	4 - Report results, testing the validity of the output if the seeds are known.
+
+	Arguments:
+	1 - first seed  : Any value
+	2 - second seed : Must be identical to first for iterations to be identical
+	3 - third seed  : Any value, should be at least an order of magnitude less than the input size, but bigger than 32.
+	4 - Iterations  : Special, if set to 0, iterations will be automatically determined such that the benchmark will run between 10 to 100 secs
+
+	Returns:
+	MAIN_RETURN_VAL if the list_head structure is too big; otherwise total_errors, which is
+	0 when every check passed, positive when CRC or data-type checks failed, and starts at -1
+	when the seed CRC matches none of the known configurations.
+*/
+
+#if MAIN_HAS_NOARGC
+MAIN_RETURN_TYPE main(void) {
+	int argc=0;
+	char *argv[1];
+#else
+MAIN_RETURN_TYPE main(int argc, char *argv[]) {
+#endif
+	ee_u16 i,j=0,num_algorithms=0;
+	ee_s16 known_id=-1,total_errors=0;
+	ee_u16 seedcrc=0;
+	CORE_TICKS total_time;
+	core_results results[MULTITHREAD];
+#if (MEM_METHOD==MEM_STACK)
+	ee_u8 stack_memblock[TOTAL_DATA_SIZE*MULTITHREAD];
+#endif
+
+  ioe_init();
+
+  ee_printf("Running CoreMark for %d iterations\n", ITERATIONS);
+
+	/* first call any initializations needed */
+	portable_init(&(results[0].port), &argc, argv);
+	/* First some checks to make sure benchmark will run ok */
+	if (sizeof(struct list_head_s)>128) {
+		ee_printf("list_head structure too big for comparable data!\n");
+		return MAIN_RETURN_VAL;
+	}
+	results[0].seed1=get_seed(1);
+	results[0].seed2=get_seed(2);
+	results[0].seed3=get_seed(3);
+	results[0].iterations=get_seed_32(4);
+#if CORE_DEBUG
+	results[0].iterations=1;
+#endif
+	results[0].execs=get_seed_32(5);
+	if (results[0].execs==0) { /* if not supplied, execute all algorithms */
+		results[0].execs=ALL_ALGORITHMS_MASK;
+	}
+		/* put in some default values based on one seed only for easy testing */
+	if ((results[0].seed1==0) && (results[0].seed2==0) && (results[0].seed3==0)) { /* performance run (matches the 0,0,0x66 case reported below) */
+		results[0].seed1=0;
+		results[0].seed2=0;
+		results[0].seed3=0x66;
+	}
+	if ((results[0].seed1==1) && (results[0].seed2==0) && (results[0].seed3==0)) { /* validation run (matches the 0x3415,0x3415,0x66 case reported below) */
+		results[0].seed1=0x3415;
+		results[0].seed2=0x3415;
+		results[0].seed3=0x66;
+	}
+#if (MEM_METHOD==MEM_STATIC)
+	results[0].memblock[0]=(void *)static_memblk;
+	results[0].size=TOTAL_DATA_SIZE;
+	results[0].err=0;
+	#if (MULTITHREAD>1)
+	#error "Cannot use a static data area with multiple contexts!"
+	#endif
+#elif (MEM_METHOD==MEM_MALLOC)
+	for (i=0 ; i<MULTITHREAD; i++) {
+		ee_s32 malloc_override=get_seed(7);
+		if (malloc_override != 0)
+			results[i].size=malloc_override;
+		else
+			results[i].size=TOTAL_DATA_SIZE;
+		results[i].memblock[0]=portable_malloc(results[i].size);
+		results[i].seed1=results[0].seed1;
+		results[i].seed2=results[0].seed2;
+		results[i].seed3=results[0].seed3;
+		results[i].err=0;
+		results[i].execs=results[0].execs;
+	}
+#elif (MEM_METHOD==MEM_STACK)
+	for (i=0 ; i<MULTITHREAD; i++) {
+		results[i].memblock[0]=stack_memblock+i*TOTAL_DATA_SIZE;
+		results[i].size=TOTAL_DATA_SIZE;
+		results[i].seed1=results[0].seed1;
+		results[i].seed2=results[0].seed2;
+		results[i].seed3=results[0].seed3;
+		results[i].err=0;
+		results[i].execs=results[0].execs;
+	}
+#else
+#error "Please define a way to initialize a memory block."
+#endif
+	/* Data init */
+	/* Find out how space much we have based on number of algorithms */
+	for (i=0; i<NUM_ALGORITHMS; i++) {
+		if ((1<<(ee_u32)i) & results[0].execs)
+			num_algorithms++;
+	}
+	/* NOTE(review): num_algorithms stays 0 if execs selects none of the first NUM_ALGORITHMS
+	   bits, which would make the division below a divide-by-zero - confirm callers never do that. */
+	for (i=0 ; i<MULTITHREAD; i++)
+		results[i].size=results[i].size/num_algorithms;
+	/* Assign pointers */
+	for (i=0; i<NUM_ALGORITHMS; i++) {
+		ee_u32 ctx;
+		if ((1<<(ee_u32)i) & results[0].execs) {
+			for (ctx=0 ; ctx<MULTITHREAD; ctx++)
+				results[ctx].memblock[i+1]=(char *)(results[ctx].memblock[0])+results[0].size*j;
+			j++;
+		}
+	}
+	/* call inits */
+	for (i=0 ; i<MULTITHREAD; i++) {
+		if (results[i].execs & ID_LIST) {
+			results[i].list=core_list_init(results[0].size,results[i].memblock[1],results[i].seed1);
+		}
+		if (results[i].execs & ID_MATRIX) {
+			core_init_matrix(results[0].size, results[i].memblock[2], (ee_s32)results[i].seed1 | (((ee_s32)results[i].seed2) << 16), &(results[i].mat) );
+		}
+		if (results[i].execs & ID_STATE) {
+			core_init_state(results[0].size,results[i].seed1,results[i].memblock[3]);
+		}
+	}
+
+	/* automatically determine number of iterations if not set */
+	if (results[0].iterations==0) {
+		secs_ret secs_passed=0;
+		ee_u32 divisor;
+		results[0].iterations=1;
+		while (secs_passed < (secs_ret)1) {
+			results[0].iterations*=10;
+			start_time();
+			iterate(&results[0]);
+			stop_time();
+			secs_passed=time_in_secs(get_time());
+		}
+		/* now we know it executes for at least 1 sec, set actual run time at about 10 secs */
+		divisor=(ee_u32)secs_passed;
+		if (divisor==0) /* some machines cast float to int as 0 since this conversion is not defined by ANSI, but we know at least one second passed */
+			divisor=1;
+		results[0].iterations*=1+10/divisor;
+	}
+	/* perform actual benchmark */
+	start_time();
+#if (MULTITHREAD>1)
+	if (default_num_contexts>MULTITHREAD) {
+		default_num_contexts=MULTITHREAD;
+	}
+	for (i=0 ; i<default_num_contexts; i++) {
+		results[i].iterations=results[0].iterations;
+		results[i].execs=results[0].execs;
+		core_start_parallel(&results[i]);
+	}
+	for (i=0 ; i<default_num_contexts; i++) {
+		core_stop_parallel(&results[i]);
+	}
+#else
+	iterate(&results[0]);
+#endif
+	stop_time();
+	total_time=get_time();
+	/* get a function of the input to report */
+	seedcrc=crc16(results[0].seed1,seedcrc);
+	seedcrc=crc16(results[0].seed2,seedcrc);
+	seedcrc=crc16(results[0].seed3,seedcrc);
+	seedcrc=crc16(results[0].size,seedcrc);
+
+	switch (seedcrc) { /* test known output for common seeds */
+		case 0x8a02: /* seed1=0, seed2=0, seed3=0x66, size 2000 per algorithm */
+			known_id=0;
+			ee_printf("6k performance run parameters for coremark.\n");
+			break;
+		case 0x7b05: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 2000 per algorithm */
+			known_id=1;
+			ee_printf("6k validation run parameters for coremark.\n");
+			break;
+		case 0x4eaf: /* seed1=0x8, seed2=0x8, seed3=0x8, size 400 per algorithm */
+			known_id=2;
+			ee_printf("Profile generation run parameters for coremark.\n");
+			break;
+		case 0xe9f5: /* seed1=0, seed2=0, seed3=0x66, size 666 per algorithm */
+			known_id=3;
+			ee_printf("2K performance run parameters for coremark.\n");
+			break;
+		case 0x18f2: /*  seed1=0x3415, seed2=0x3415, seed3=0x66, size 666 per algorithm */
+			known_id=4;
+			ee_printf("2K validation run parameters for coremark.\n");
+			break;
+		default:
+			total_errors=-1;
+			break;
+	}
+	if (known_id>=0) {
+		for (i=0 ; i<default_num_contexts; i++) {
+			results[i].err=0;
+			if ((results[i].execs & ID_LIST) &&
+				(results[i].crclist!=list_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! list crc 0x%04x - should be 0x%04x\n",i,results[i].crclist,list_known_crc[known_id]);
+				results[i].err++;
+			}
+			if ((results[i].execs & ID_MATRIX) &&
+				(results[i].crcmatrix!=matrix_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! matrix crc 0x%04x - should be 0x%04x\n",i,results[i].crcmatrix,matrix_known_crc[known_id]);
+				results[i].err++;
+			}
+			if ((results[i].execs & ID_STATE) &&
+				(results[i].crcstate!=state_known_crc[known_id])) {
+				ee_printf("[%u]ERROR! state crc 0x%04x - should be 0x%04x\n",i,results[i].crcstate,state_known_crc[known_id]);
+				results[i].err++;
+			}
+			total_errors+=results[i].err;
+		}
+	}
+	total_errors+=check_data_types();
+	/* and report results */
+	ee_printf("CoreMark Size    : %d\n",(int)results[0].size);
+#if HAS_FLOAT
+	ee_printf("Total time (ms)  : %f\n",time_in_secs(total_time));
+	if (time_in_secs(total_time) > 0)
+		ee_printf("Iterations/mSec  : %f\n",default_num_contexts*results[0].iterations/time_in_secs(total_time));
+#else
+	ee_printf("Total time (ms)  : %d\n",time_in_secs(total_time));
+#endif
+	ee_printf("Iterations       : %d\n",(int)default_num_contexts*results[0].iterations);
+	ee_printf("Compiler version : %s\n",COMPILER_VERSION);
+#if (MULTITHREAD>1)
+	ee_printf("Parallel %s : %d\n",PARALLEL_METHOD,default_num_contexts);
+#endif
+	/* output for verification */
+	ee_printf("seedcrc          : 0x%04x\n",seedcrc);
+	if (results[0].execs & ID_LIST)
+		for (i=0 ; i<default_num_contexts; i++)
+			ee_printf("[%d]crclist       : 0x%04x\n",i,results[i].crclist);
+	if (results[0].execs & ID_MATRIX)
+		for (i=0 ; i<default_num_contexts; i++)
+			ee_printf("[%d]crcmatrix     : 0x%04x\n",i,results[i].crcmatrix);
+	if (results[0].execs & ID_STATE)
+		for (i=0 ; i<default_num_contexts; i++)
+			ee_printf("[%d]crcstate      : 0x%04x\n",i,results[i].crcstate);
+	for (i=0 ; i<default_num_contexts; i++)
+		ee_printf("[%d]crcfinal      : 0x%04x\n",i,results[i].crc);
+  ee_printf("Finised in %d ms.\n", (int)total_time);
+	if (total_errors==0) {
+		ee_printf("==================================================\n");
+		/* The score divides by the elapsed time; skip it when the timer reported 0 so a very
+		   short run cannot divide by zero (same guard as the HAS_FLOAT report above). */
+		if (time_in_secs(total_time) > 0) {
+			ee_printf("CoreMark PASS       %d Marks\n", 2921400 / time_in_secs(total_time) * ITERATIONS / 1000);
+		} else {
+			ee_printf("CoreMark PASS       (elapsed time too short to compute Marks)\n");
+		}
+		ee_printf("                vs. 100000 Marks (i7-7700K @ 4.20GHz)\n");
+	}
+	if (total_errors>0)
+		ee_printf("Errors detected\n");
+	if (total_errors<0)
+		ee_printf("Cannot validate operation for these seed values, please compare with results on a known platform.\n");
+
+#if (MEM_METHOD==MEM_MALLOC)
+	for (i=0 ; i<MULTITHREAD; i++)
+		portable_free(results[i].memblock[0]);
+#endif
+	/* And last call any target specific code for finalizing */
+	portable_fini(&(results[0].port));
+
+	return total_errors;
+}
+
+
--- a/am-kernels/benchmarks/coremark/src/core_matrix.c
+++ b/am-kernels/benchmarks/coremark/src/core_matrix.c
@ -0,0 +1,308 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+#include "coremark.h"
+/*
+Topic: Description
+	Matrix manipulation benchmark
+
+	This very simple algorithm forms the basis of many more complex algorithms.
+
+	The tight inner loop is the focus of many optimizations (compiler as well as hardware based)
+	and is thus relevant for embedded processing.
+
+	The total available data space will be divided to 3 parts:
+	NxN Matrix A - initialized with small values (upper 3/4 of the bits all zero).
+	NxN Matrix B - initialized with medium values (upper half of the bits all zero).
+	NxN Matrix C - used for the result.
+
+	The actual values for A and B must be derived based on input that is not available at compile time.
+*/
+/* Forward declarations of the matrix kernels defined later in this file. */
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val);
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval);
+void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val);
+void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B);
+void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val);
+
+/* matrix_test_next: x plus one. The argument is parenthesized so an expression such as
+   a<<1 or c?a:b expands with the intended precedence. */
+#define matrix_test_next(x) ((x)+1)
+/* matrix_clip: keep the low 8 bits of x when y is non-zero, otherwise the low 16 bits. */
+#define matrix_clip(x,y) ((y) ? (x) & 0x0ff : (x) & 0x0ffff)
+/* matrix_big: x with the bits of 0xf000 forced on. */
+#define matrix_big(x) (0xf000 | (x))
+/* bit_extract: shift x right by `from`, then keep the low `to` bits (`to` is a width). */
+#define bit_extract(x,from,to) (((x)>>(from)) & (~(0xffffffff << (to))))
+
+#if CORE_DEBUG
+/* Debug helper: print the NxN input matrix A under the given name, one comma-separated row per line. */
+void printmat(MATDAT *A, ee_u32 N, char *name) {
+	ee_u32 row,col;
+	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			/* a separator precedes every element except the first of the row */
+			if (col>0)
+				ee_printf(",");
+			ee_printf("%d",A[row*N+col]);
+		}
+		ee_printf("\n");
+	}
+}
+/* Debug helper: print the NxN result matrix C under the given name, one comma-separated row per line. */
+void printmatC(MATRES *C, ee_u32 N, char *name) {
+	ee_u32 row,col;
+	ee_printf("Matrix %s [%dx%d]:\n",name,N,N);
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			/* a separator precedes every element except the first of the row */
+			if (col>0)
+				ee_printf(",");
+			ee_printf("%d",C[row*N+col]);
+		}
+		ee_printf("\n");
+	}
+}
+#endif
+/* Function: core_bench_matrix
+	Benchmark entry point for the matrix workload.
+
+	Runs <matrix_test> once on the matrices held in p, using the seed (converted to MATDAT)
+	as the constant, and folds the returned value into the running crc.
+
+	Returns:
+	The updated crc.
+*/
+ee_u16 core_bench_matrix(mat_params *p, ee_s16 seed, ee_u16 crc) {
+	ee_s16 test_result=matrix_test(p->N,p->C,p->A,p->B,(MATDAT)seed);
+
+	return crc16(test_result,crc);
+}
+
+/* Function: matrix_test
+	Perform matrix manipulation.
+
+	Parameters:
+	N - Dimensions of the matrix.
+	C - memory for result matrix.
+	A - input matrix
+	B - operator matrix (not changed during operations)
+	val - constant added to A, used as the multiplier, and (through matrix_big) as the clip value.
+
+	Returns:
+	A CRC value that captures all results calculated in the function.
+	In particular, crc of the value calculated on the result matrix
+	after each step by <matrix_sum>.
+
+	Operation:
+
+	1 - Add a constant value to all elements of matrix A.
+	2 - Multiply a matrix by a constant.
+	3 - Multiply a matrix by a vector.
+	4 - Multiply a matrix by a matrix.
+	5 - Multiply a matrix by a matrix, extracting bit fields from each product.
+	6 - Add the negated constant to all elements of matrix A.
+
+	After the last step, matrix A is back to original contents.
+*/
+ee_s16 matrix_test(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B, MATDAT val) {
+	ee_u16 crc=0;
+	MATDAT clipval=matrix_big(val);
+
+	matrix_add_const(N,A,val); /* make sure data changes  */
+#if CORE_DEBUG
+	printmat(A,N,"matrix_add_const");
+#endif
+	/* each multiply below overwrites C and is summarized by matrix_sum into the crc */
+	matrix_mul_const(N,C,A,val);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_const");
+#endif
+	matrix_mul_vect(N,C,A,B);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_vect");
+#endif
+	matrix_mul_matrix(N,C,A,B);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_matrix");
+#endif
+	matrix_mul_matrix_bitextract(N,C,A,B);
+	crc=crc16(matrix_sum(N,C,clipval),crc);
+#if CORE_DEBUG
+	printmatC(C,N,"matrix_mul_matrix_bitextract");
+#endif
+
+	matrix_add_const(N,A,-val); /* return matrix to initial value */
+	return crc;
+}
+
+/* Function : core_init_matrix
+	Initialize the memory block for matrix benchmarking.
+
+	Parameters:
+	blksize - Size of memory to be initialized.
+	memblk - Pointer to memory block.
+	seed - Actual values chosen depend on the seed parameter (0 is replaced by 1).
+	p - pointers to <mat_params> containing initialized matrixes.
+
+	Returns:
+	Matrix dimensions: the largest N for which N*N*2*4 is below blksize,
+	or 0 when blksize is 0.
+
+	Note:
+	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+*/
+ee_u32 core_init_matrix(ee_u32 blksize, void *memblk, ee_s32 seed, mat_params *p) {
+	ee_u32 N=0;
+	MATDAT *A;
+	MATDAT *B;
+	ee_s32 order=1;
+	MATDAT val;
+	ee_u32 i=0,j=0;
+	if (seed==0)
+		seed=1;
+	/* find the first i whose footprint i*i*2*4 reaches blksize; N is one less */
+	while (j<blksize) {
+		i++;
+		j=i*i*2*4;
+	}
+	/* the loop never runs for blksize==0, leaving i at 0; i-1 would then wrap to 0xffffffff */
+	N=(i>0) ? i-1 : 0;
+	A=(MATDAT *)align_mem(memblk);
+	B=A+N*N;
+
+	for (i=0; i<N; i++) {
+		for (j=0; j<N; j++) {
+			seed = ( ( order * seed ) % 65536 );
+			val = (seed + order);
+			val=matrix_clip(val,0);
+			B[i*N+j] = val;
+			val =  (val + order);
+			val=matrix_clip(val,1);
+			A[i*N+j] = val;
+			order++;
+		}
+	}
+
+	p->A=A;
+	p->B=B;
+	/* C is placed right after B, re-aligned */
+	p->C=(MATRES *)align_mem(B+N*N);
+	p->N=N;
+#if CORE_DEBUG
+	printmat(A,N,"A");
+	printmat(B,N,"B");
+#endif
+	return N;
+}
+
+/* Function: matrix_sum
+	Reduce the NxN matrix C to a single score that depends on its element values.
+
+	Elements are visited row by row and added to a running total.
+	When the total exceeds clipval, the score grows by 10 and the total is reset to 0.
+	Otherwise the score grows by 1 if the element is greater than the one visited before it.
+
+	Returns:
+	The accumulated score.
+*/
+ee_s16 matrix_sum(ee_u32 N, MATRES *C, MATDAT clipval) {
+	MATRES running=0,last=0,elem=0;
+	ee_s16 score=0;
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			elem=C[row*N+col];
+			running+=elem;
+			if (running>clipval) {
+				score+=10;
+				running=0;
+			} else if (elem>last) {
+				score+=1;
+			}
+			last=elem;
+		}
+	}
+	return score;
+}
+
+/* Function: matrix_mul_const
+	Store in C every element of the NxN matrix A multiplied by val.
+	Both operands are widened to MATRES before the multiplication.
+*/
+void matrix_mul_const(ee_u32 N, MATRES *C, MATDAT *A, MATDAT val) {
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			ee_u32 idx=row*N+col;
+			C[idx]=(MATRES)A[idx] * (MATRES)val;
+		}
+	}
+}
+
+/* Function: matrix_add_const
+	Add val, in place, to every element of the NxN matrix A.
+*/
+void matrix_add_const(ee_u32 N, MATDAT *A, MATDAT val) {
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			ee_u32 idx=row*N+col;
+			A[idx] += val;
+		}
+	}
+}
+
+/* Function: matrix_mul_vect
+	Multiply the NxN matrix A by the vector held in the first N elements of B.
+	Only the first N elements of C are written, one dot product per row of A.
+*/
+void matrix_mul_vect(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 row,col;
+	for (row=0; row<N; row++) {
+		C[row]=0;
+		for (col=0; col<N; col++) {
+			/* accumulate directly into C[row], as the result slot doubles as the accumulator */
+			C[row]+=(MATRES)A[row*N+col] * (MATRES)B[col];
+		}
+	}
+}
+
+/* Function: matrix_mul_matrix
+	Compute C = A * B for NxN matrices stored row-major.
+	Operands are widened to MATRES before each multiplication.
+*/
+void matrix_mul_matrix(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 row,col,inner;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			ee_u32 out=row*N+col;
+			C[out]=0;
+			for (inner=0; inner<N; inner++) {
+				C[out]+=(MATRES)A[row*N+inner] * (MATRES)B[inner*N+col];
+			}
+		}
+	}
+}
+
+/* Function: matrix_mul_matrix_bitextract
+	Matrix-by-matrix multiply variant: for each A[row][k] * B[k][col] product, two bit
+	fields are extracted with bit_extract (shift 2 / width 4 and shift 5 / width 7) and
+	the product of those two fields is what gets accumulated into C[row][col].
+*/
+void matrix_mul_matrix_bitextract(ee_u32 N, MATRES *C, MATDAT *A, MATDAT *B) {
+	ee_u32 row,col,inner;
+	for (row=0; row<N; row++) {
+		for (col=0; col<N; col++) {
+			ee_u32 out=row*N+col;
+			C[out]=0;
+			for (inner=0; inner<N; inner++) {
+				MATRES prod=(MATRES)A[row*N+inner] * (MATRES)B[inner*N+col];
+				C[out]+=bit_extract(prod,2,4)*bit_extract(prod,5,7);
+			}
+		}
+	}
+}
--- a/am-kernels/benchmarks/coremark/src/core_portme.c
+++ b/am-kernels/benchmarks/coremark/src/core_portme.c
@ -0,0 +1,109 @@
+#include "coremark.h"
+
+/* Seed values for the run. seed1..seed3 are defined by whichever of VALIDATION_RUN,
+   PERFORMANCE_RUN or PROFILE_RUN is non-zero; if none is set this file defines no
+   seed1..seed3, and if more than one is set they are defined more than once. */
+#if VALIDATION_RUN
+	volatile ee_s32 seed1_volatile=0x3415;
+	volatile ee_s32 seed2_volatile=0x3415;
+	volatile ee_s32 seed3_volatile=0x66;
+#endif
+#if PERFORMANCE_RUN
+	volatile ee_s32 seed1_volatile=0x0;
+	volatile ee_s32 seed2_volatile=0x0;
+	volatile ee_s32 seed3_volatile=0x66;
+#endif
+#if PROFILE_RUN
+	volatile ee_s32 seed1_volatile=0x8;
+	volatile ee_s32 seed2_volatile=0x8;
+	volatile ee_s32 seed3_volatile=0x8;
+#endif
+	/* seed4 carries the iteration count; seed5 is 0 (main reads seed 5 as the algorithm
+	   mask and substitutes ALL_ALGORITHMS_MASK when it is 0) */
+	volatile ee_s32 seed4_volatile=ITERATIONS;
+	volatile ee_s32 seed5_volatile=0;
+/* Porting : Timing functions
+	How to capture time and convert to seconds must be ported to whatever is supported by the platform.
+	e.g. Read value from on board RTC, read value from cpu clock cycles performance counter etc.
+	Sample implementation for standard time.h and windows.h definitions included.
+
+	NOTE(review): the macros below describe a clock()-based sample implementation; the timing
+	functions visible further down in this file read the AM timer through uptime_ms() and do
+	not reference them - confirm whether they are still needed.
+*/
+/* Define : TIMER_RES_DIVIDER
+	Divider to trade off timer resolution and total time that can be measured.
+
+	Use lower values to increase resolution, but make sure that overflow does not occur.
+	If there are issues with the return value overflowing, increase this value.
+	*/
+#define NSECS_PER_SEC CLOCKS_PER_SEC
+#define CORETIMETYPE clock_t
+#define GETMYTIME(_t) (*_t=clock())
+#define MYTIMEDIFF(fin,ini) ((fin)-(ini))
+#define TIMER_RES_DIVIDER 1
+#define SAMPLE_TIME_IMPLEMENTATION 1
+#define EE_TICKS_PER_SEC (NSECS_PER_SEC / TIMER_RES_DIVIDER)
+
+/* Uptime in milliseconds: the `us` field read from AM_TIMER_UPTIME divided by 1000,
+   narrowed to 32 bits. */
+static uint32_t uptime_ms() {
+  uint32_t now_ms = io_read(AM_TIMER_UPTIME).us / 1000;
+  return now_ms;
+}
+
+/** Define Host specific (POSIX), or target specific global time variables. */
+unsigned long start_time_val, stop_time_val;
+
+/* Function : start_time
+	Called right before the timed portion of the benchmark begins.
+
+	Records the current uptime_ms() reading in start_time_val.
+*/
+void start_time(void) {
+  start_time_val = uptime_ms();
+}
+/* Function : stop_time
+	Called right after the timed portion of the benchmark ends.
+
+	Records the current uptime_ms() reading in stop_time_val.
+*/
+void stop_time(void) {
+  stop_time_val = uptime_ms();
+}
+/* Function : get_time
+	Return the number of ticks between the last start_time() and stop_time() calls.
+
+	Both readings come from uptime_ms(), so one tick is one millisecond here.
+	<time_in_secs> is the companion that converts the tick count for reporting.
+*/
+CORE_TICKS get_time(void) {
+  CORE_TICKS elapsed = stop_time_val - start_time_val;
+  return elapsed;
+}
+
+/* Function : time_in_secs
+	Convert the value returned by get_time for reporting.
+
+	Despite the name, this port returns the tick count unchanged, converted to <secs_ret>.
+	Ticks are milliseconds here (see uptime_ms), which is why main labels the value "(ms)".
+	The <secs_ret> type is used to accommodate systems with no support for floating point.
+*/
+secs_ret time_in_secs(CORE_TICKS ticks) {
+  secs_ret elapsed_ms = ticks;
+  return elapsed_ms;
+}
+
+ee_u32 default_num_contexts=1;
+
+/* Function : portable_init
+	Target specific initialization code.
+
+	Reports (without aborting) two common porting mistakes: ee_ptr_int not being
+	pointer-sized, and ee_u32 not being 4 bytes. Then marks the port as initialized
+	by setting p->portable_id to 1. argc and argv are not used.
+*/
+void portable_init(core_portable *p, int *argc, char *argv[])
+{
+	int ptr_int_ok=(sizeof(ee_ptr_int) == sizeof(ee_u8 *));
+	int u32_ok=(sizeof(ee_u32) == 4);
+
+	if (!ptr_int_ok) {
+		ee_printf("ERROR! Please define ee_ptr_int to a type that holds a pointer!\n");
+	}
+	if (!u32_ok) {
+		ee_printf("ERROR! Please define ee_u32 to a 32b unsigned type!\n");
+	}
+	p->portable_id=1;
+}
+/* Function : portable_fini
+	Target specific finalization: clears p->portable_id back to 0.
+*/
+void portable_fini(core_portable *p)
+{
+	p->portable_id=0;
+}
+
+
--- a/am-kernels/benchmarks/coremark/src/core_state.c
+++ b/am-kernels/benchmarks/coremark/src/core_state.c
@ -0,0 +1,277 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+#include "coremark.h"
+/* local functions */
+enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count);
+
+/*
+Topic: Description
+	Simple state machines like this one are used in many embedded products.
+
+	For more complex state machines, sometimes a state transition table implementation is used instead,
+	trading speed of direct coding for ease of maintenance.
+
+	Since the main goal of using a state machine in CoreMark is to exercise the switch/if behaviour,
+	we are using a small Moore machine.
+
+	In particular, this machine tests type of string input,
+	trying to determine whether the input is a number or something else.
+	(see core_state.png).
+*/
+
+/* Function: core_bench_state
+	Benchmark function
+
+	Go over the input twice, once direct, and once after introducing some corruption.
+
+	Parameters:
+	blksize - Number of bytes of memblock that the corruption passes walk over.
+	memblock - Zero-terminated input buffer of comma-separated tokens (modified in place).
+	seed1 - Low byte is XORed into every step-th non-comma byte before the second scan.
+	seed2 - Low byte is XORed into the same positions after the second scan.
+	step - Distance in bytes between corrupted positions.
+	       NOTE(review): the corruption loops only terminate if step is positive - confirm callers guarantee it.
+	crc - Running CRC to fold the results into.
+
+	Returns:
+	crc updated with final_counts[i] and track_counts[i] for every state i.
+*/
+ee_u16 core_bench_state(ee_u32 blksize, ee_u8 *memblock,
+		ee_s16 seed1, ee_s16 seed2, ee_s16 step, ee_u16 crc)
+{
+	/* final_counts: how many tokens ended in each state;
+	   track_counts: per-state counters updated inside core_state_transition */
+	ee_u32 final_counts[NUM_CORE_STATES];
+	ee_u32 track_counts[NUM_CORE_STATES];
+	ee_u8 *p=memblock;
+	ee_u32 i;
+
+
+#if CORE_DEBUG
+	ee_printf("State Bench: %d,%d,%d,%04x\n",seed1,seed2,step,crc);
+#endif
+	for (i=0; i<NUM_CORE_STATES; i++) {
+		final_counts[i]=track_counts[i]=0;
+	}
+	/* run the state machine over the input */
+	while (*p!=0) {
+		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
+		final_counts[fstate]++;
+		/* the loop's closing brace is supplied by whichever preprocessor branch is active */
+#if CORE_DEBUG
+	ee_printf("%d,",fstate);
+	}
+	ee_printf("\n");
+#else
+	}
+#endif
+	p=memblock;
+	while (p < (memblock+blksize)) { /* insert some corruption */
+		if (*p!=',')
+			*p^=(ee_u8)seed1;
+		p+=step;
+	}
+	p=memblock;
+	/* run the state machine over the input again */
+	while (*p!=0) {
+		enum CORE_STATE fstate=core_state_transition(&p,track_counts);
+		final_counts[fstate]++;
+		/* same split closing brace as in the first scan */
+#if CORE_DEBUG
+	ee_printf("%d,",fstate);
+	}
+	ee_printf("\n");
+#else
+	}
+#endif
+	p=memblock;
+	while (p < (memblock+blksize)) { /* undo corruption if seed1 and seed2 are equal */
+		if (*p!=',')
+			*p^=(ee_u8)seed2;
+		p+=step;
+	}
+	/* end timing */
+	for (i=0; i<NUM_CORE_STATES; i++) {
+		crc=crcu32(final_counts[i],crc);
+		crc=crcu32(track_counts[i],crc);
+	}
+	return crc;
+}
+
+/* Default initialization patterns: four samples each of integer, float, scientific and
+   invalid tokens. Integer patterns are 4 characters long, all others 8. */
+static ee_u8 *intpat[4]  ={(ee_u8 *)"5012",(ee_u8 *)"1234",(ee_u8 *)"-874",(ee_u8 *)"+122"};
+static ee_u8 *floatpat[4]={(ee_u8 *)"35.54400",(ee_u8 *)".1234500",(ee_u8 *)"-110.700",(ee_u8 *)"+0.64400"};
+static ee_u8 *scipat[4]  ={(ee_u8 *)"5.500e+3",(ee_u8 *)"-.123e-2",(ee_u8 *)"-87e+832",(ee_u8 *)"+0.6e-12"};
+static ee_u8 *errpat[4]  ={(ee_u8 *)"T0.3e-1F",(ee_u8 *)"-T.T++Tq",(ee_u8 *)"1T3.4e4z",(ee_u8 *)"34.0e-T^"};
+
+/* Function: core_init_state
+	Initialize the input data for the state machine.
+
+	Populate the input with several predetermined strings, interspersed.
+	Actual patterns chosen depend on the seed parameter.
+	Each pattern is followed by a ',' and the remainder of the buffer is filled with 0.
+
+	Parameters:
+	size - Number of bytes available at p. Nothing is written when size is 0.
+	seed - Starting value; incremented before each pattern is chosen. Bits 0-2 select the
+	       pattern class and bits 3-4 select one of its four samples.
+	p - Destination buffer.
+
+	Note:
+	The seed parameter MUST be supplied from a source that cannot be determined at compile time
+*/
+void core_init_state(ee_u32 size, ee_s16 seed, ee_u8 *p) {
+	ee_u32 total=0,next=0,i;
+	ee_u8 *buf=0;
+#if CORE_DEBUG
+	ee_u8 *start=p;
+	ee_printf("State: %d,%d\n",size,seed);
+#endif
+	/* size is unsigned: decrementing 0 below would wrap and the fill loops would run far
+	   past the buffer, so an empty buffer is left untouched */
+	if (size==0)
+		return;
+	/* reserve the last byte so the buffer always ends with a 0 terminator */
+	size--;
+	next=0;
+	while ((total+next+1)<size) {
+		/* copy the pattern chosen on the previous pass (none on the first), then its separator */
+		if (next>0) {
+			for(i=0;i<next;i++)
+				*(p+total+i)=buf[i];
+			*(p+total+i)=',';
+			total+=next+1;
+		}
+		seed++;
+		switch (seed & 0x7) {
+			case 0: /* int */
+			case 1: /* int */
+			case 2: /* int */
+				buf=intpat[(seed>>3) & 0x3];
+				next=4;
+			break;
+			case 3: /* float */
+			case 4: /* float */
+				buf=floatpat[(seed>>3) & 0x3];
+				next=8;
+			break;
+			case 5: /* scientific */
+			case 6: /* scientific */
+				buf=scipat[(seed>>3) & 0x3];
+				next=8;
+			break;
+			case 7: /* invalid */
+				buf=errpat[(seed>>3) & 0x3];
+				next=8;
+			break;
+			default: /* Never happen, just to make some compilers happy */
+			break;
+		}
+	}
+	size++;
+	while (total<size) { /* fill the rest with 0 */
+		*(p+total)=0;
+		total++;
+	}
+#if CORE_DEBUG
+	ee_printf("State Input: %s\n",start);
+#endif
+}
+
+/* Return 1 when c is an ASCII decimal digit ('0'..'9'), 0 otherwise.
+   The two comparisons are combined with bitwise &, so both are always evaluated. */
+static ee_u8 ee_isdigit(ee_u8 c) {
+	return ((c>='0') & (c<='9')) ? 1 : 0;
+}
+
+/* Function: core_state_transition
+	Actual state machine.
+
+	The state machine will continue scanning until either:
+	1 - an invalid input is detected.
+	2 - a valid number has been detected.
+
+	Scanning of one token stops at a ',' separator, at the terminating 0, or once the state
+	has become CORE_INVALID.
+
+	Parameters:
+	instr - In/out pointer to the current position in the input.
+	transition_count - Per-state counters, indexed by CORE_STATE, incremented as transitions are taken.
+
+	The input pointer is updated to point to the end of the token, and the end state is returned (either specific format determined or invalid).
+*/
+
+enum CORE_STATE core_state_transition( ee_u8 **instr , ee_u32 *transition_count) {
+	ee_u8 *str=*instr;
+	ee_u8 NEXT_SYMBOL;
+	enum CORE_STATE state=CORE_START;
+	/* str++ still runs after the iteration that sets CORE_INVALID, so on an invalid token
+	   the pointer ends one past the offending character */
+	for( ; *str && state != CORE_INVALID; str++ ) {
+		NEXT_SYMBOL = *str;
+		if (NEXT_SYMBOL==',') /* end of this input */ {
+			str++; /* step over the separator so the next call starts on the next token */
+			break;
+		}
+		switch(state) {
+		case CORE_START:
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INT;
+			}
+			else if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = CORE_S1;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_INVALID]++;
+			}
+			/* leaving CORE_START is always counted, whichever branch was taken */
+			transition_count[CORE_START]++;
+			break;
+		case CORE_S1: /* a sign has been seen; every outcome counts as a CORE_S1 transition */
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INT;
+				transition_count[CORE_S1]++;
+			}
+			else if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+				transition_count[CORE_S1]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_S1]++;
+			}
+			break;
+		case CORE_INT: /* further digits keep the state and are not counted */
+			if( NEXT_SYMBOL == '.' ) {
+				state = CORE_FLOAT;
+				transition_count[CORE_INT]++;
+			}
+			else if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_INT]++;
+			}
+			break;
+		case CORE_FLOAT: /* further digits keep the state and are not counted */
+			if( NEXT_SYMBOL == 'E' || NEXT_SYMBOL == 'e' ) {
+				state = CORE_S2;
+				transition_count[CORE_FLOAT]++;
+			}
+			else if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				transition_count[CORE_FLOAT]++;
+			}
+			break;
+		case CORE_S2: /* just after the exponent marker: only a sign is accepted here */
+			if( NEXT_SYMBOL == '+' || NEXT_SYMBOL == '-' ) {
+				state = CORE_EXPONENT;
+				transition_count[CORE_S2]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_S2]++;
+			}
+			break;
+		case CORE_EXPONENT: /* after the exponent sign: a digit is required */
+			if(ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_SCIENTIFIC;
+				transition_count[CORE_EXPONENT]++;
+			}
+			else {
+				state = CORE_INVALID;
+				transition_count[CORE_EXPONENT]++;
+			}
+			break;
+		case CORE_SCIENTIFIC:
+			if(!ee_isdigit(NEXT_SYMBOL)) {
+				state = CORE_INVALID;
+				/* NOTE(review): unlike the other states this increments the CORE_INVALID
+				   counter rather than the current state's - presumably the known CRCs
+				   depend on it, so do not "fix" without re-validating */
+				transition_count[CORE_INVALID]++;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	*instr=str;
+	return state;
+}
--- a/am-kernels/benchmarks/coremark/src/core_util.c
+++ b/am-kernels/benchmarks/coremark/src/core_util.c
@ -0,0 +1,210 @@
+/*
+Author : Shay Gal-On, EEMBC
+
+This file is part of  EEMBC(R) and CoreMark(TM), which are Copyright (C) 2009
+All rights reserved.
+
+EEMBC CoreMark Software is a product of EEMBC and is provided under the terms of the
+CoreMark License that is distributed with the official EEMBC COREMARK Software release.
+If you received this EEMBC CoreMark Software without the accompanying CoreMark License,
+you must discontinue use and download the official release from www.coremark.org.
+
+Also, if you are publicly displaying scores generated from the EEMBC CoreMark software,
+make sure that you are in compliance with Run and Reporting rules specified in the accompanying readme.txt file.
+
+EEMBC
+4354 Town Center Blvd. Suite 114-200
+El Dorado Hills, CA, 95762
+*/
+#include "coremark.h"
+/* Function: get_seed
+	Get values that cannot be determined at compile time.
+
+	Since different embedded systems and compilers are used, 3 different methods are provided:
+	1 - Using a volatile variable. This method is only valid if the compiler is forced to generate code that
+	reads the value of a volatile variable from memory at run time.
+	Please note, if using this method, you would need to modify core_portme.c to generate training profile.
+	2 - Command line arguments. This is the preferred method if command line arguments are supported.
+	3 - System function. If none of the first 2 methods is available on the platform,
+	a system function which is not a stub can be used.
+
+	e.g. read the value on GPIO pins connected to switches, or invoke special simulator functions.
+*/
+#if (SEED_METHOD==SEED_VOLATILE)
+	extern volatile ee_s32 seed1_volatile;
+	extern volatile ee_s32 seed2_volatile;
+	extern volatile ee_s32 seed3_volatile;
+	extern volatile ee_s32 seed4_volatile;
+	extern volatile ee_s32 seed5_volatile;
+	/* Return the run-time seed selected by i.
+	   i in 1..5 reads the matching seedN_volatile variable (exactly one
+	   volatile read per call); any other index yields 0. */
+	ee_s32 get_seed_32(int i) {
+		if (i == 1)
+			return seed1_volatile;
+		if (i == 2)
+			return seed2_volatile;
+		if (i == 3)
+			return seed3_volatile;
+		if (i == 4)
+			return seed4_volatile;
+		if (i == 5)
+			return seed5_volatile;
+		return 0;
+	}
+#elif (SEED_METHOD==SEED_ARG)
+/* Parse a command-line seed value.
+   Accepts an optional leading '-', then either decimal digits or a "0x"
+   prefix followed by hex digits (0-9, lowercase a-f only), and finally an
+   optional 'K' (x1024) or 'M' (x1024*1024) suffix. Parsing stops at the first
+   character that is not a valid digit; anything after the suffix position is
+   ignored. Returns the signed result. */
+ee_s32 parseval(char *valstring) {
+	ee_s32 value=0;
+	ee_s32 sign=1;
+	ee_s32 base=10;
+
+	if (*valstring == '-') {
+		sign=-1;
+		valstring++;
+	}
+	if ((valstring[0] == '0') && (valstring[1] == 'x')) {
+		base=16;
+		valstring+=2;
+	}
+	/* accumulate digits in the selected base */
+	for (;;valstring++) {
+		ee_s32 digit;
+		if ((*valstring >= '0') && (*valstring <= '9'))
+			digit=*valstring-'0';
+		else if ((base == 16) && (*valstring >= 'a') && (*valstring <= 'f'))
+			digit=10+*valstring-'a';
+		else
+			break;
+		value*=base;
+		value+=digit;
+	}
+	/* apply the size qualifier, if any */
+	switch (*valstring) {
+		case 'K':
+			value*=1024;
+			break;
+		case 'M':
+			value*=1024*1024;
+			break;
+		default:
+			break;
+	}
+
+	value*=sign;
+	return value;
+}
+
+/* Return argv[i] parsed by parseval(), or 0 when fewer than i+1 arguments
+   were supplied. */
+ee_s32 get_seed_args(int i, int argc, char *argv[]) {
+	return (argc>i) ? parseval(argv[i]) : 0;
+}
+
+#elif (SEED_METHOD==SEED_FUNC)
+/* If using OS based function, you must define and implement the functions below in core_portme.h and core_portme.c ! */
+/* Return the seed selected by i: indices 1..5 call the matching
+   portme_sysN() hook, any other index yields 0. */
+ee_s32 get_seed_32(int i) {
+	if (i == 1)
+		return portme_sys1();
+	if (i == 2)
+		return portme_sys2();
+	if (i == 3)
+		return portme_sys3();
+	if (i == 4)
+		return portme_sys4();
+	if (i == 5)
+		return portme_sys5();
+	return 0;
+}
+#endif
+
+/* Function: crc*
+	Service functions to calculate 16b CRC code.
+
+*/
+/* Fold one byte into a running 16-bit CRC.
+   data - byte to add; its bits are consumed least-significant first.
+   crc  - CRC accumulated so far.
+   Returns the updated CRC.
+   Per bit: x16 is the incoming data bit XORed with the CRC's low bit. When it
+   is 1 the CRC is XORed with 0x4002 before the right shift and bit 15 is set
+   afterwards; otherwise bit 15 is cleared after the shift. */
+ee_u16 crcu8(ee_u8 data, ee_u16 crc )
+{
+	ee_u8 i=0,x16=0,carry=0;
+
+	for (i = 0; i < 8; i++)
+    {
+		x16 = (ee_u8)((data & 1) ^ ((ee_u8)crc & 1));
+		data >>= 1;
+
+		if (x16 == 1)
+		{
+		   crc ^= 0x4002;
+		   carry = 1;
+		}
+		else
+			carry = 0;
+		crc >>= 1;
+		if (carry)
+		   crc |= 0x8000;
+		else
+		   crc &= 0x7fff;
+    }
+	return crc;
+}
+/* Fold a 16-bit value into the CRC: low byte first, then high byte. */
+ee_u16 crcu16(ee_u16 newval, ee_u16 crc) {
+	crc=crcu8( (ee_u8) (newval)				,crc);
+	crc=crcu8( (ee_u8) ((newval)>>8)	,crc);
+	return crc;
+}
+/* Fold a 32-bit value into the CRC: low 16 bits first, then the high 16 bits,
+   each passed through crc16(). */
+ee_u16 crcu32(ee_u32 newval, ee_u16 crc) {
+	crc=crc16((ee_s16) newval		,crc);
+	crc=crc16((ee_s16) (newval>>16)	,crc);
+	return crc;
+}
+/* Signed-input wrapper: converts newval to ee_u16 and forwards to crcu16(). */
+ee_u16 crc16(ee_s16 newval, ee_u16 crc) {
+	return crcu16((ee_u16)newval, crc);
+}
+
+/* Verify that the port's ee_* typedefs have the sizes CoreMark relies on.
+   Prints one error line per mismatching type, plus a closing hint when at
+   least one mismatch was found. Returns the number of mismatches
+   (0 means every type is correct). */
+ee_u8 check_data_types() {
+	ee_u8 failures=0;
+
+	if (sizeof(ee_u8) != 1) {
+		failures++;
+		ee_printf("ERROR: ee_u8 is not an 8b datatype!\n");
+	}
+	if (sizeof(ee_u16) != 2) {
+		failures++;
+		ee_printf("ERROR: ee_u16 is not a 16b datatype!\n");
+	}
+	if (sizeof(ee_s16) != 2) {
+		failures++;
+		ee_printf("ERROR: ee_s16 is not a 16b datatype!\n");
+	}
+	if (sizeof(ee_s32) != 4) {
+		failures++;
+		ee_printf("ERROR: ee_s32 is not a 32b datatype!\n");
+	}
+	if (sizeof(ee_u32) != 4) {
+		failures++;
+		ee_printf("ERROR: ee_u32 is not a 32b datatype!\n");
+	}
+	if (sizeof(ee_ptr_int) != sizeof(int *)) {
+		failures++;
+		ee_printf("ERROR: ee_ptr_int is not a datatype that holds an int pointer!\n");
+	}
+	if (failures != 0) {
+		ee_printf("ERROR: Please modify the datatypes in core_portme.h!\n");
+	}
+	return failures;
+}
--- a/am-kernels/benchmarks/dhrystone/Makefile
+++ b/am-kernels/benchmarks/dhrystone/Makefile
@ -0,0 +1,3 @@
+NAME = dhrystone
+SRCS = dry.c
+include $(AM_HOME)/Makefile
--- a/am-kernels/benchmarks/dhrystone/dry.c
+++ b/am-kernels/benchmarks/dhrystone/dry.c
@ -0,0 +1,950 @@
+/****************** "DHRYSTONE" Benchmark Program ***************************/
+#define Version "C, Version 2.2"
+/*  File:       dhry_1.c (part 2 of 3)
+ *  Author:     Reinhold P. Weicker
+ *              Siemens Nixdorf, Paderborn/Germany
+ *              weicker@specbench.org
+ *  Date:       May 25, 1988
+ *  Modified:	Steven Pemberton, CWI, Amsterdam; Steven.Pemberton@cwi.nl
+ *  Date:       October, 1993; March 1995
+ *              Included both files into one source, that gets compiled
+ *              in two passes. Made program auto-compiling, and auto-running,
+ *              and generally made it much easier to use.
+ *
+ *              Original Version (in Ada) published in
+ *              "Communications of the ACM" vol. 27., no. 10 (Oct. 1984),
+ *              pp. 1013 - 1030, together with the statistics
+ *              on which the distribution of statements etc. is based.
+ *
+ *              In this C version, the following C library functions are used:
+ *              - strcpy, strcmp (inside the measurement loop)
+ *              - printf, scanf (outside the measurement loop)
+ *              In addition, Berkeley UNIX system calls "times ()" or "time ()"
+ *              are used for execution time measurement. For measurements
+ *              on other systems, these calls have to be changed.
+ *
+ *  Collection of Results:
+ *              Reinhold Weicker (address see above) and
+ *
+ *              Rick Richardson
+ *              PC Research. Inc.
+ *              94 Apple Orchard Drive
+ *              Tinton Falls, NJ 07724
+ *                      Phone:  (201) 389-8963 (9-17 EST)
+ *                      Usenet: ...!uunet!pcrat!rick
+ *
+ *      Please send results to Rick Richardson and/or Reinhold Weicker.
+ *      Complete information should be given on hardware and software used.
+ *      Hardware information includes: Machine type, CPU, type and size
+ *      of caches; for microprocessors: clock frequency, memory speed
+ *      (number of wait states).
+ *      Software information includes: Compiler (and runtime library)
+ *      manufacturer and version, compilation switches, OS version.
+ *      The Operating System version may give an indication about the compiler;
+ *      Dhrystone itself performs no OS calls in the measurement loop.
+ *
+ *      The complete output generated by the program should be mailed
+ *      such that at least some checks for correctness can be made.
+ *
+ ***************************************************************************
+ *
+ * Defines:     The following "Defines" are possible:
+ *      -DREG          (default: Not defined)
+ *              As an approximation to what an average C programmer
+ *              might do, causes the "register" storage class to be applied
+ *              - for local variables, if they are used (dynamically)
+ *                five or more times
+ *              - for parameters if they are used (dynamically)
+ *                six or more times
+ *              Note that an optimal "register" strategy is
+ *              compiler-dependent, and that "register" declarations
+ *              do not necessarily lead to faster execution.
+ *      -DNOSTRUCTASSIGN        (default: Not defined)
+ *              Define if the C compiler does not support
+ *              assignment of structures.
+ *      -DNOENUMS               (default: Not defined)
+ *              Define if the C compiler does not support
+ *              enumeration types.
+ *      -DTIMES                 (default)
+ *      -DTIME
+ *              The "times" function of UNIX (returning process times)
+ *              or the "time" function (returning wallclock time)
+ *              is used for measurement.
+ *              For single user machines, "time ()" is adequate. For
+ *              multi-user machines where you cannot get single-user
+ *              access, use the "times ()" function. If you have
+ *              neither, use a stopwatch in the dead of night.
+ *              "printf"s are provided marking the points "Start Timer"
+ *              and "Stop Timer". DO NOT use the UNIX "time(1)"
+ *              command, as this will measure the total time to
+ *              run this program, which will (erroneously) include
+ *              the time to allocate storage (malloc) and to perform
+ *              the initialization.
+ *      -DHZ=nnn
+ *              In Berkeley UNIX, the function "times" returns process
+ *              time in 1/HZ seconds, with HZ = 60 for most systems.
+ *              CHECK YOUR SYSTEM DESCRIPTION BEFORE YOU JUST APPLY
+ *              A VALUE.
+ *
+ ***************************************************************************
+ *
+ *  History:	Version C/2.1 was made for two reasons:
+ *
+ *	1) There was an obvious need for a common C version of
+ *      Dhrystone, since C is at present the most popular system
+ *      programming language for the class of processors
+ *      (microcomputers, minicomputers) where Dhrystone is used most.
+ *      There should be, as far as possible, only one C version of
+ *      Dhrystone such that results can be compared without
+ *      restrictions. In the past, the C versions distributed
+ *      by Rick Richardson (Version 1.1) and by Reinhold Weicker
+ *      had small (though not significant) differences.
+ *
+ *      2) As far as it is possible without changes to the Dhrystone
+ *      statistics, optimizing compilers should be prevented from
+ *      removing significant statements.
+ *
+ *      This C version has been developed in cooperation with
+ *      Rick Richardson (Tinton Falls, NJ), it incorporates many
+ *      ideas from the "Version 1.1" distributed previously by
+ *      him over the UNIX network Usenet.
+ *      I also thank Chaim Benedelac (National Semiconductor),
+ *      David Ditzel (SUN), Earl Killian and John Mashey (MIPS),
+ *      Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley)
+ *      for their help with comments on earlier versions of the
+ *      benchmark.
+ *
+ *  Changes:    In the initialization part, this version follows mostly
+ *      Rick Richardson's version distributed via Usenet, not the
+ *      version distributed earlier via floppy disk by Reinhold Weicker.
+ *      As a concession to older compilers, names have been made
+ *      unique within the first 8 characters.
+ *      Inside the measurement loop, this version follows the
+ *      version previously distributed by Reinhold Weicker.
+ *
+ *      At several places in the benchmark, code has been added,
+ *      but within the measurement loop only in branches that
+ *      are not executed. The intention is that optimizing compilers
+ *      should be prevented from moving code out of the measurement
+ *      loop, or from removing code altogether. Since the statements
+ *      that are executed within the measurement loop have NOT been
+ *      changed, the numbers defining the "Dhrystone distribution"
+ *      (distribution of statements, operand types and locality)
+ *      still hold. Except for sophisticated optimizing compilers,
+ *      execution times for this version should be the same as
+ *      for previous versions.
+ *
+ *      Since it has proven difficult to subtract the time for the
+ *      measurement loop overhead in a correct way, the loop check
+ *      has been made a part of the benchmark. This does have
+ *      an impact - though a very minor one - on the distribution
+ *      statistics which have been updated for this version.
+ *
+ *      All changes within the measurement loop are described
+ *      and discussed in the companion paper "Rationale for
+ *      Dhrystone version 2".
+ *
+ *      Because of the self-imposed limitation that the order and
+ *      distribution of the executed statements should not be
+ *      changed, there are still cases where optimizing compilers
+ *      may not generate code for some statements. To a certain
+ *      degree, this is unavoidable for small synthetic benchmarks.
+ *      Users of the benchmark are advised to check code listings
+ *      whether code is generated for all statements of Dhrystone.
+ *
+ *      Version 2.1 is identical to version 2.0 distributed via
+ *      the UNIX network Usenet in March 1988 except that it corrects
+ *      some minor deficiencies that were found by users of version 2.0.
+ *      The only change within the measurement loop is that a
+ *      non-executed "else" part was added to the "if" statement in
+ *      Func_3, and a non-executed "else" part removed from Proc_3.
+ *
+ * Version C/2.2, Steven Pemberton, October 1993
+ *	Functionally, identical to version 2.1; the changes are in
+ *	how you compile and use it:
+ *	- Everything is in one file now, but compiled in 2 passes
+ *	- Compile (and run) by running the file through the shell: 'sh dhry.c'
+ *	- Uses the system definition of HZ if one can be found
+ *	- HZ must be defined, otherwise it won't compile (no defaults here)
+ *	- The (uninteresting) output is printed to stderr (dhry2 > /dev/null)
+ *	- The number of loops is passed as a parameter, rather than read
+ *	  (dhry2 500000)
+ *	- If the number of loops is insufficient to get a good result,
+ *	  it repeats it with loops*10 until it is enough (rather than just
+ *	  stopping)
+ *	- Output says which sort of clock it is using, and the HZ value
+ *	- You can use -DREG instead of the -DREG=register of previous versions
+ *	- Some stylistic cleanups.
+ *
+ ***************************************************************************
+ *
+ *  Compilation model and measurement (IMPORTANT):
+ *
+ *  The following "ground rules" apply for measurements:
+ *  - Separate compilation
+ *  - No procedure merging
+ *  - Otherwise, compiler optimizations are allowed but should be indicated
+ *  - Default results are those without register declarations
+ *  See the companion paper "Rationale for Dhrystone Version 2" for a more
+ *  detailed discussion of these ground rules.
+ *
+ *  For 16-Bit processors (e.g. 80186, 80286), times for all compilation
+ *  models ("small", "medium", "large" etc.) should be given if possible,
+ *  together with a definition of these models for the compiler system used.
+ *
+ **************************************************************************
+ *
+ *  Dhrystone (C version) statistics:
+ *
+ *  [Comment from the first distribution, updated for version 2.
+ *   Note that because of language differences, the numbers are slightly
+ *   different from the Ada version.]
+ *
+ *  The following program contains statements of a high level programming
+ *  language (here: C) in a distribution considered representative:
+ *
+ *    assignments                  52 (51.0 %)
+ *    control statements           33 (32.4 %)
+ *    procedure, function calls    17 (16.7 %)
+ *
+ *  103 statements are dynamically executed. The program is balanced with
+ *  respect to the three aspects:
+ *
+ *    - statement type
+ *    - operand type
+ *    - operand locality
+ *         operand global, local, parameter, or constant.
+ *
+ *  The combination of these three aspects is balanced only approximately.
+ *
+ *  1. Statement Type:
+ *  -----------------             number
+ *
+ *     V1 = V2                     9
+ *       (incl. V1 = F(..))
+ *     V = Constant               12
+ *     Assignment,                 7
+ *       with array element
+ *     Assignment,                 6
+ *       with record component
+ *                                --
+ *                                34       34
+ *
+ *     X = Y +|-|"&&"|"|" Z        5
+ *     X = Y +|-|"==" Constant     6
+ *     X = X +|- 1                 3
+ *     X = Y *|/ Z                 2
+ *     X = Expression,             1
+ *           two operators
+ *     X = Expression,             1
+ *           three operators
+ *                                --
+ *                                18       18
+ *
+ *     if ....                    14
+ *       with "else"      7
+ *       without "else"   7
+ *           executed        3
+ *           not executed    4
+ *     for ...                     7  |  counted every time
+ *     while ...                   4  |  the loop condition
+ *     do ... while                1  |  is evaluated
+ *     switch ...                  1
+ *     break                       1
+ *     declaration with            1
+ *       initialization
+ *                                --
+ *                                34       34
+ *
+ *     P (...)  procedure call    11
+ *       user procedure      10
+ *       library procedure    1
+ *     X = F (...)
+ *             function  call      6
+ *       user function        5
+ *       library function     1
+ *                                --
+ *                                17       17
+ *                                        ---
+ *                                        103
+ *
+ *    The average number of parameters in procedure or function calls
+ *    is 1.82 (not counting the function values as implicit parameters).
+ *
+ *  2. Operators
+ *  ------------
+ *                          number    approximate
+ *                                    percentage
+ *
+ *    Arithmetic             32          50.8
+ *
+ *       +                     21          33.3
+ *       -                      7          11.1
+ *       *                      3           4.8
+ *       / (int div)            1           1.6
+ *
+ *    Comparison             27           42.8
+ *
+ *       ==                     9           14.3
+ *       /=                     4            6.3
+ *       >                      1            1.6
+ *       <                      3            4.8
+ *       >=                     1            1.6
+ *       <=                     9           14.3
+ *
+ *    Logic                   4            6.3
+ *
+ *       && (AND-THEN)          1            1.6
+ *       |  (OR)                1            1.6
+ *       !  (NOT)               2            3.2
+ *
+ *                           --          -----
+ *                           63          100.1
+ *
+ *
+ *  3. Operand Type (counted once per operand reference):
+ *  ---------------
+ *                          number    approximate
+ *                                    percentage
+ *
+ *     Integer               175        72.3 %
+ *     Character              45        18.6 %
+ *     Pointer                12         5.0 %
+ *     String30                6         2.5 %
+ *     Array                   2         0.8 %
+ *     Record                  2         0.8 %
+ *                           ---       -------
+ *                           242       100.0 %
+ *
+ *  When there is an access path leading to the final operand (e.g. a record
+ *  component), only the final data type on the access path is counted.
+ *
+ *
+ *  4. Operand Locality:
+ *  -------------------
+ *                                number    approximate
+ *                                          percentage
+ *
+ *     local variable              114        47.1 %
+ *     global variable              22         9.1 %
+ *     parameter                    45        18.6 %
+ *        value                        23         9.5 %
+ *        reference                    22         9.1 %
+ *     function result               6         2.5 %
+ *     constant                     55        22.7 %
+ *                                 ---       -------
+ *                                 242       100.0 %
+ *
+ *  The program does not compute anything meaningful, but it is syntactically
+ *  and semantically correct. All variables have a value assigned to them
+ *  before they are used as a source operand.
+ *
+ *  There has been no explicit effort to account for the effects of a
+ *  cache, or to balance the use of long or short displacements for code or
+ *  data.
+ *
+ ***************************************************************************
+ */
+
+/* Compiler and system dependent definitions: */
+
+/* variables for time measurement: */
+
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+/* Current uptime in milliseconds: the `us` field read from AM_TIMER_UPTIME
+   divided by 1000, narrowed to 32 bits. */
+static uint32_t uptime_ms() {
+  return (uint32_t)(io_read(AM_TIMER_UPTIME).us / 1000);
+}
+#define Start_Timer() Begin_Time = uptime_ms()
+#define Stop_Timer()  End_Time   = uptime_ms()
+
+#define NUMBER_OF_RUNS		500000 /* Default number of runs */
+#define PASS2
+
+#ifdef  NOSTRUCTASSIGN
+#define structassign(d, s)      memcpy(&(d), &(s), sizeof(d))
+#else
+#define structassign(d, s)      d = s
+#endif
+
+/* Enumeration type used throughout the benchmark.
+   The header comment documents the opt-out switch as -DNOENUMS while this
+   code historically tested NOENUM, so the documented flag was silently
+   ignored. Accept either spelling: with one of them defined the identifiers
+   become plain integer macros and Enumeration is an int; otherwise a real
+   enum is used. */
+#if defined(NOENUM) || defined(NOENUMS)
+#define Ident_1 0
+#define Ident_2 1
+#define Ident_3 2
+#define Ident_4 3
+#define Ident_5 4
+  typedef int   Enumeration;
+#else
+  typedef       enum    {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5}
+                Enumeration;
+#endif
+        /* for boolean and enumeration types in Ada, Pascal */
+
+/* General definitions: */
+
+
+#define Null 0
+                /* Value of a Null pointer */
+
+typedef int     One_Thirty;
+typedef int     One_Fifty;
+typedef char    Capital_Letter;
+typedef int     Boolean;
+typedef char    Str_30 [31];
+typedef int     Arr_1_Dim [50];
+typedef int     Arr_2_Dim [50] [50];
+
+/* Linked record used by the benchmark. Discr selects which member of the
+   `variant` union is meaningful; in the code visible here only var_1 is
+   accessed. */
+typedef struct record
+    {
+    struct record *Ptr_Comp;    /* link to another record */
+    Enumeration    Discr;       /* discriminant for `variant` */
+    union {
+          struct {
+                  Enumeration Enum_Comp;
+                  int         Int_Comp;
+                  char        Str_Comp [31];    /* 30 characters + NUL */
+                  } var_1;
+          struct {
+                  Enumeration E_Comp_2;
+                  char        Str_2_Comp [31];  /* 30 characters + NUL */
+                  } var_2;
+          struct {
+                  char        Ch_1_Comp;
+                  char        Ch_2_Comp;
+                  } var_3;
+          } variant;
+      } Rec_Type, *Rec_Pointer;
+
+/* Global Variables: */
+
+Rec_Pointer     Ptr_Glob,
+                Next_Ptr_Glob;
+int             Int_Glob;
+Boolean         Bool_Glob;
+char            Ch_1_Glob,
+                Ch_2_Glob;
+int             Arr_1_Glob [50];
+int             Arr_2_Glob [50] [50];
+
+Enumeration     Func_1 ();
+  /* forward declaration necessary since Enumeration may not simply be int */
+
+/* Normalise the REG switch and record the choice in the global `Reg`:
+   - REG not defined on the command line: Reg = false, REG expands to nothing.
+   - REG defined (e.g. -DREG):            Reg = true,  REG expands to `register`.
+   After this block REG is always defined. */
+#ifndef REG
+        Boolean Reg = false;
+#define REG
+        /* REG becomes defined as empty */
+        /* i.e. no register variables   */
+#else
+        Boolean Reg = true;
+#undef REG
+#define REG register
+#endif
+
+Boolean		Done;
+
+long            Begin_Time,
+                End_Time,
+                User_Time;
+float           Microseconds,
+                Dhrystones_Per_Second;
+
+/* end of variables for time measurement */
+
+/* Fixed pool backing myalloc(); free_mem points at the first unused byte. */
+static char memory[1024];
+static char *free_mem = &memory[0];
+
+/* Minimal bump allocator over `memory` (there is no free()).
+   size - number of bytes requested.
+   Returns a pointer aligned to sizeof(void *) so that structures containing
+   pointers are correctly aligned on both 32-bit and 64-bit targets (the
+   previous fixed 4-byte alignment was insufficient on 64-bit ISAs), or a
+   null pointer when the pool cannot satisfy the request; in that case the
+   allocator state is left unchanged instead of running past the buffer. */
+static char* myalloc(size_t size) {
+  const unsigned long align = sizeof(void *);
+  size_t used = (size_t)(free_mem - memory);
+  /* bytes to skip so the returned address is a multiple of `align` */
+  size_t pad = (size_t)((align - (unsigned long)free_mem % align) % align);
+  size_t avail = sizeof(memory) - used;
+
+  if (pad > avail || size > avail - pad)
+    return (char *)0;
+
+  char *ret = free_mem + pad;
+  free_mem = ret + size;
+  return ret;
+}
+
+void Proc_6 (Enumeration, Enumeration*);
+void Proc_3 (Rec_Pointer*);
+void Proc_7 (One_Fifty a, One_Fifty b, One_Fifty* c);
+Boolean Func_2 (Str_30, Str_30);
+void Proc_8(Arr_1_Dim, Arr_2_Dim, int, int);
+Boolean Func_3 (Enumeration);
+
+/* Proc_1: copies *Ptr_Glob into the record Ptr_Val_Par->Ptr_Comp points to,
+   sets Int_Comp to 5 in both records, relinks the next record through Proc_3,
+   then either updates the next record via Proc_6 / Proc_7 (when its Discr is
+   Ident_1) or copies the next record back over *Ptr_Val_Par. */
+void Proc_1 (Ptr_Val_Par)
+/******************/
+
+REG Rec_Pointer Ptr_Val_Par;
+    /* executed once */
+{
+  REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp;
+                                        /* == Ptr_Glob_Next */
+  /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp,    */
+  /* corresponds to "rename" in Ada, "with" in Pascal           */
+
+  structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob);
+  Ptr_Val_Par->variant.var_1.Int_Comp = 5;
+  Next_Record->variant.var_1.Int_Comp
+        = Ptr_Val_Par->variant.var_1.Int_Comp;
+  Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp;
+  Proc_3 (&Next_Record->Ptr_Comp);
+    /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp
+                        == Ptr_Glob->Ptr_Comp */
+  if (Next_Record->Discr == Ident_1)
+    /* then, executed */
+  {
+    Next_Record->variant.var_1.Int_Comp = 6;
+    Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp,
+           &Next_Record->variant.var_1.Enum_Comp);
+    Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp;
+    Proc_7 (Next_Record->variant.var_1.Int_Comp, 10,
+           &Next_Record->variant.var_1.Int_Comp);
+  }
+  else /* not executed */
+    structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp);
+} /* Proc_1 */
+
+
+/* Proc_2: when Ch_1_Glob is 'A', stores (*Int_Par_Ref + 10 - 1 - Int_Glob)
+   back through Int_Par_Ref and leaves the loop.
+   NOTE(review): Enum_Loc is assigned only inside the `if`; if Ch_1_Glob were
+   not 'A' the loop condition would read it uninitialized. The code is kept
+   as-is because the statement mix is part of the benchmark. */
+void Proc_2 (Int_Par_Ref)
+/******************/
+    /* executed once */
+    /* *Int_Par_Ref == 1, becomes 4 */
+
+One_Fifty   *Int_Par_Ref;
+{
+  One_Fifty  Int_Loc;
+  Enumeration   Enum_Loc;
+
+  Int_Loc = *Int_Par_Ref + 10;
+  do /* executed once */
+    if (Ch_1_Glob == 'A')
+      /* then, executed */
+    {
+      Int_Loc -= 1;
+      *Int_Par_Ref = Int_Loc - Int_Glob;
+      Enum_Loc = Ident_1;
+    } /* if */
+  while (Enum_Loc != Ident_1); /* true */
+} /* Proc_2 */
+
+
+/* Proc_3: if Ptr_Glob is non-null, stores Ptr_Glob->Ptr_Comp through
+   Ptr_Ref_Par; then sets Ptr_Glob's Int_Comp to 10 + 2 + Int_Glob via Proc_7.
+   Note that the Proc_7 call dereferences Ptr_Glob regardless of the null
+   check above it. */
+void Proc_3 (Ptr_Ref_Par)
+/******************/
+    /* executed once */
+    /* Ptr_Ref_Par becomes Ptr_Glob */
+
+Rec_Pointer *Ptr_Ref_Par;
+
+{
+  if (Ptr_Glob != Null)
+    /* then, executed */
+    *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp;
+  Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp);
+} /* Proc_3 */
+
+
+/* Proc_4: ORs (Ch_1_Glob == 'A') into Bool_Glob using bitwise '|' and sets
+   Ch_2_Glob to 'B'. */
+void Proc_4 () /* without parameters */
+/*******/
+    /* executed once */
+{
+  Boolean Bool_Loc;
+
+  Bool_Loc = Ch_1_Glob == 'A';
+  Bool_Glob = Bool_Loc | Bool_Glob;
+  Ch_2_Glob = 'B';
+} /* Proc_4 */
+
+
+/* Proc_5: resets the globals Ch_1_Glob to 'A' and Bool_Glob to false. */
+void Proc_5 () /* without parameters */
+/*******/
+    /* executed once */
+{
+  Ch_1_Glob = 'A';
+  Bool_Glob = false;
+} /* Proc_5 */
+
+
+        /* Procedure for the assignment of structures,          */
+        /* if the C compiler doesn't support this feature       */
+#ifdef  NOSTRUCTASSIGN
+/* Fallback byte-wise copy of l bytes from s to d, compiled only when
+   NOSTRUCTASSIGN is defined. It is declared without a return type and
+   returns no value.
+   NOTE(review): this may conflict with a memcpy declared by the included
+   library headers - verify before enabling NOSTRUCTASSIGN. */
+memcpy (d, s, l)
+register char   *d;
+register char   *s;
+register int    l;
+{
+        while (l--) *d++ = *s++;
+}
+#endif
+
+
+/* REG has already been normalised earlier in this file (empty without -DREG,
+   `register` with it). This section used to be a separate translation unit
+   and repeated the #ifndef/#else logic; in the merged file REG is always
+   defined at this point, so the old #else branch unconditionally turned an
+   empty REG into `register`, contradicting Reg == false. Only provide the
+   empty default if REG is somehow still undefined. */
+#ifndef REG
+#define REG
+        /* REG becomes defined as empty */
+        /* i.e. no register variables   */
+#endif
+
+extern  int     Int_Glob;
+extern  char    Ch_1_Glob;
+
+
+/* Proc_6: maps Enum_Val_Par to a result stored through Enum_Ref_Par.
+   The result starts as Enum_Val_Par (or Ident_4 when Func_3 rejects it) and
+   is then overridden by the switch: Ident_1 -> Ident_1; Ident_2 -> Ident_1 if
+   Int_Glob > 100, else Ident_4; Ident_3 -> Ident_2; Ident_4 -> unchanged;
+   Ident_5 -> Ident_3. */
+void Proc_6 (Enum_Val_Par, Enum_Ref_Par)
+/*********************************/
+    /* executed once */
+    /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */
+
+Enumeration  Enum_Val_Par;
+Enumeration *Enum_Ref_Par;
+{
+  *Enum_Ref_Par = Enum_Val_Par;
+  if (! Func_3 (Enum_Val_Par))
+    /* then, not executed */
+    *Enum_Ref_Par = Ident_4;
+  switch (Enum_Val_Par)
+  {
+    case Ident_1:
+      *Enum_Ref_Par = Ident_1;
+      break;
+    case Ident_2:
+      if (Int_Glob > 100)
+        /* then */
+      *Enum_Ref_Par = Ident_1;
+      else *Enum_Ref_Par = Ident_4;
+      break;
+    case Ident_3: /* executed */
+      *Enum_Ref_Par = Ident_2;
+      break;
+    case Ident_4: break;
+    case Ident_5:
+      *Enum_Ref_Par = Ident_3;
+      break;
+  } /* switch */
+} /* Proc_6 */
+
+
+/* Proc_7: stores Int_1_Par_Val + 2 + Int_2_Par_Val through Int_Par_Ref. */
+void Proc_7 (One_Fifty Int_1_Par_Val, One_Fifty Int_2_Par_Val, One_Fifty *Int_Par_Ref)
+{
+  One_Fifty Int_Loc;
+
+  Int_Loc = Int_1_Par_Val + 2;
+  *Int_Par_Ref = Int_2_Par_Val + Int_Loc;
+} /* Proc_7 */
+
+
+/* Proc_8: with Int_Loc = Int_1_Par_Val + 5, writes Arr_1_Par_Ref at indices
+   Int_Loc, Int_Loc+1 and Int_Loc+30, writes Arr_2_Par_Ref in rows Int_Loc and
+   Int_Loc+20, and sets Int_Glob to 5.
+   NOTE(review): the array types have 50 elements per dimension, so the
+   largest index used (Int_Loc+30) requires Int_1_Par_Val <= 14. */
+void Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val)
+/*********************************************************************/
+    /* executed once      */
+    /* Int_Par_Val_1 == 3 */
+    /* Int_Par_Val_2 == 7 */
+Arr_1_Dim       Arr_1_Par_Ref;
+Arr_2_Dim       Arr_2_Par_Ref;
+int             Int_1_Par_Val;
+int             Int_2_Par_Val;
+{
+  REG One_Fifty Int_Index;
+  REG One_Fifty Int_Loc;
+
+  Int_Loc = Int_1_Par_Val + 5;
+  Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val;
+  Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc];
+  Arr_1_Par_Ref [Int_Loc+30] = Int_Loc;
+  for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index)
+    Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc;
+  Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1;
+  Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc];
+  Int_Glob = 5;
+} /* Proc_8 */
+
+
+/* Func_1: returns Ident_1 when the two characters differ; otherwise stores
+   the character in Ch_1_Glob and returns Ident_2. */
+Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val)
+/*************************************************/
+    /* executed three times                                         */
+    /* first call:      Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R'    */
+    /* second call:     Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C'    */
+    /* third call:      Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C'    */
+
+Capital_Letter   Ch_1_Par_Val;
+Capital_Letter   Ch_2_Par_Val;
+{
+  Capital_Letter        Ch_1_Loc;
+  Capital_Letter        Ch_2_Loc;
+
+  Ch_1_Loc = Ch_1_Par_Val;
+  Ch_2_Loc = Ch_1_Loc;
+  if (Ch_2_Loc != Ch_2_Par_Val)
+    /* then, executed */
+    return (Ident_1);
+  else  /* not executed */
+  {
+    Ch_1_Glob = Ch_1_Loc;
+    return (Ident_2);
+   }
+} /* Func_1 */
+
+
+/* Func_2: compares characters of the two strings through Func_1 until one
+   pair differs (the loop only advances when Func_1 returns Ident_1), then
+   returns true if Ch_Loc is 'R' or if strcmp orders Str_1_Par_Ref after
+   Str_2_Par_Ref (also setting Int_Glob in that case); otherwise false. */
+Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref)
+/*************************************************/
+    /* executed once */
+    /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */
+    /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */
+
+Str_30  Str_1_Par_Ref;
+Str_30  Str_2_Par_Ref;
+{
+  REG One_Thirty        Int_Loc;
+      Capital_Letter    Ch_Loc;
+
+  Int_Loc = 2;
+  while (Int_Loc <= 2) /* loop body executed once */
+    if (Func_1 (Str_1_Par_Ref[Int_Loc],
+                Str_2_Par_Ref[Int_Loc+1]) == Ident_1)
+      /* then, executed */
+    {
+      Ch_Loc = 'A';
+      Int_Loc += 1;
+    } /* if, while */
+  if (Ch_Loc >= 'W' && Ch_Loc < 'Z')
+    /* then, not executed */
+    Int_Loc = 7;
+  if (Ch_Loc == 'R') {
+    /* then, not executed */
+    return (true);
+  }
+  else /* executed */
+  {
+    if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0)
+    {
+      Int_Loc += 7;
+      Int_Glob = Int_Loc;
+      return (true);
+    }
+    else /* executed */
+      return (false);
+  } /* if Ch_Loc */
+} /* Func_2 */
+
+
+/* Func_3: returns true exactly when Enum_Par_Val is Ident_3. */
+Boolean Func_3 (Enum_Par_Val)
+/***************************/
+    /* executed once        */
+    /* Enum_Par_Val == Ident_3 */
+Enumeration Enum_Par_Val;
+{
+  Enumeration Enum_Loc;
+
+  Enum_Loc = Enum_Par_Val;
+  if (Enum_Loc == Ident_3)
+    /* then, executed */
+    return (true);
+  else /* not executed */
+    return (false);
+} /* Func_3 */
+
+
+Boolean pass = true;
+/* Record a failed condition in the global `pass` flag (it is only ever
+   cleared here, never set); returns cond unchanged so the call can be used
+   inside an expression. */
+Boolean check(int cond) {
+  if (cond) return cond;
+  pass = false;
+  return cond;
+}
+/*
+ * Benchmark driver: initialises the global records, runs the Dhrystone loop
+ * NUMBER_OF_RUNS times between Start_Timer()/Stop_Timer(), verifies every
+ * global and local against its expected final value through check(), then
+ * prints the elapsed time and the score. Returns 0 when all checks passed,
+ * 1 otherwise.
+ */
+int main ()
+/*****/
+
+  /* main program, corresponds to procedures        */
+  /* Main and Proc_0 in the Ada version             */
+{
+        One_Fifty       Int_1_Loc;
+  REG   One_Fifty       Int_2_Loc;
+        One_Fifty       Int_3_Loc;
+  REG   char            Ch_Index;
+        Enumeration     Enum_Loc;
+        Str_30          Str_1_Loc;
+        Str_30          Str_2_Loc;
+  REG   int             Run_Index;
+  REG   int             Number_Of_Runs;
+
+  ioe_init();
+
+ Number_Of_Runs = NUMBER_OF_RUNS;
+
+  /* Initializations */
+
+  Next_Ptr_Glob = (Rec_Pointer) myalloc (sizeof (Rec_Type));
+  Ptr_Glob = (Rec_Pointer) myalloc (sizeof (Rec_Type));
+
+  Ptr_Glob->Ptr_Comp                    = Next_Ptr_Glob;
+  Ptr_Glob->Discr                       = Ident_1;
+  Ptr_Glob->variant.var_1.Enum_Comp     = Ident_3;
+  Ptr_Glob->variant.var_1.Int_Comp      = 40;
+  strcpy (Ptr_Glob->variant.var_1.Str_Comp,
+          "DHRYSTONE PROGRAM, SOME STRING");
+  strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING");
+
+  Arr_2_Glob [8][7] = 10;
+        /* Was missing in published program. Without this statement,    */
+        /* Arr_2_Glob [8][7] would have an undefined value.             */
+        /* Warning: With 16-Bit processors and Number_Of_Runs > 32000,  */
+        /* overflow may occur for this array element.                   */
+
+  printf ("Dhrystone Benchmark, Version %s\n", Version);
+
+  Done = false;
+  while (!Done) {
+
+    printf ("Trying %d runs through Dhrystone.\n", Number_Of_Runs);
+
+    /***************/
+    /* Start timer */
+    /***************/
+
+    Start_Timer();
+
+    for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index)
+    {
+
+      Proc_5();
+      Proc_4();
+	/* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */
+      Int_1_Loc = 2;
+      Int_2_Loc = 3;
+      strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING");
+      Enum_Loc = Ident_2;
+      Bool_Glob = ! Func_2 (Str_1_Loc, Str_2_Loc);
+	/* Bool_Glob == 1 */
+      while (Int_1_Loc < Int_2_Loc)  /* loop body executed once */
+      {
+	Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc;
+	  /* Int_3_Loc == 7 */
+	Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc);
+	  /* Int_3_Loc == 7 */
+	Int_1_Loc += 1;
+      } /* while */
+	/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+      Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc);
+	/* Int_Glob == 5 */
+      Proc_1 (Ptr_Glob);
+      for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index)
+			       /* loop body executed twice */
+      {
+	if (Enum_Loc == Func_1 (Ch_Index, 'C'))
+	    /* then, not executed */
+	  {
+	  Proc_6 (Ident_1, &Enum_Loc);
+	  strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING");
+	  Int_2_Loc = Run_Index;
+	  Int_Glob = Run_Index;
+	  }
+      }
+	/* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */
+      Int_2_Loc = Int_2_Loc * Int_1_Loc;
+      Int_1_Loc = Int_2_Loc / Int_3_Loc;
+      Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc;
+	/* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */
+      Proc_2 (&Int_1_Loc);
+	/* Int_1_Loc == 5 */
+
+    } /* loop "for Run_Index" */
+
+    /**************/
+    /* Stop timer */
+    /**************/
+
+    Stop_Timer();
+
+    User_Time = End_Time - Begin_Time;
+
+    Done = true;
+  }
+
+  /* Verify the final state; each failed check prints actual vs. expected. */
+
+  if (!check(Int_Glob == 5)) {
+    printf("Int_Glob:            %d\n", Int_Glob);
+    printf("        should be:   %d\n", 5);
+  }
+  if (!check(Bool_Glob == 1)) {
+    printf("Bool_Glob:           %d\n", Bool_Glob);
+    printf("        should be:   %d\n", 1);
+  }
+  if (!check(Ch_1_Glob == 'A')) {
+    printf("Ch_1_Glob:           %c\n", Ch_1_Glob);
+    printf("        should be:   %c\n", 'A');
+  }
+  if (!check(Ch_2_Glob == 'B')) {
+    printf("Ch_2_Glob:           %c\n", Ch_2_Glob);
+    printf("        should be:   %c\n", 'B');
+  }
+  if (!check(Arr_1_Glob[8] == 7)) {
+    printf("Arr_1_Glob[8]:       %d\n", Arr_1_Glob[8]);
+    printf("        should be:   %d\n", 7);
+  }
+  if (!check(Arr_2_Glob[8][7] == Number_Of_Runs + 10)) {
+    printf("Arr_2_Glob[8][7]:    %d\n", Arr_2_Glob[8][7]);
+    printf("        should be:   Number_Of_Runs + 10\n");
+  }
+
+  if (!check((int)Ptr_Glob->Discr == 0)) {
+    printf("Ptr_Glob->Discr:             %d\n", Ptr_Glob->Discr);
+    printf("        should be:   %d\n", 0);
+  }
+  if (!check(Ptr_Glob->variant.var_1.Enum_Comp == 2)) {
+    printf("Ptr_Glob->Enum_Comp:         %d\n", Ptr_Glob->variant.var_1.Enum_Comp);
+    printf("        should be:   %d\n", 2);
+  }
+  if (!check(Ptr_Glob->variant.var_1.Int_Comp == 17)) {
+    printf("Ptr_Glob->Int_Comp:          %d\n", Ptr_Glob->variant.var_1.Int_Comp);
+    printf("        should be:   %d\n", 17);
+  }
+  if (!check(strcmp(Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING") == 0)) {
+    printf("Ptr_Glob->Str_Comp:          %s\n", Ptr_Glob->variant.var_1.Str_Comp);
+    printf("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
+  }
+
+  if (!check((int)Next_Ptr_Glob->Discr == 0)) {
+    printf("Next_Ptr_Glob->Discr:             %d\n", Next_Ptr_Glob->Discr);
+    printf("        should be:   %d\n", 0);
+  }
+  if (!check(Next_Ptr_Glob->variant.var_1.Enum_Comp == 1)) {
+    printf("Next_Ptr_Glob->Enum_Comp:         %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp);
+    printf("        should be:   %d\n", 1);
+  }
+  if (!check(Next_Ptr_Glob->variant.var_1.Int_Comp == 18)) {
+    printf("Next_Ptr_Glob->Int_Comp:          %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp);
+    printf("        should be:   %d\n", 18);
+  }
+  if (!check(strcmp(Next_Ptr_Glob->variant.var_1.Str_Comp, "DHRYSTONE PROGRAM, SOME STRING") == 0)) {
+    printf("Next_Ptr_Glob->Str_Comp:          %s\n", Next_Ptr_Glob->variant.var_1.Str_Comp);
+    printf("        should be:   DHRYSTONE PROGRAM, SOME STRING\n");
+  }
+
+  if (!check(Int_1_Loc == 5)) {
+    printf("Int_1_Loc:           %d\n", Int_1_Loc);
+    printf("        should be:   %d\n", 5);
+  }
+  if (!check(Int_2_Loc == 13)) {
+    printf("Int_2_Loc:           %d\n", Int_2_Loc);
+    printf("        should be:   %d\n", 13);
+  }
+  if (!check(Int_3_Loc == 7)) {
+    printf("Int_3_Loc:           %d\n", Int_3_Loc);
+    printf("        should be:   %d\n", 7);
+  }
+  if (!check(Enum_Loc == 1)) {
+    printf("Enum_Loc:            %d\n", Enum_Loc);
+    printf("        should be:   %d\n", 1);
+  }
+
+  if (!check(strcmp(Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING") == 0)) {
+    printf("Str_1_Loc:           %s\n", Str_1_Loc);
+    printf("        should be:   DHRYSTONE PROGRAM, 1'ST STRING\n");
+  }
+  if (!check(strcmp(Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING") == 0)) {
+    printf("Str_2_Loc:           %s\n", Str_2_Loc);
+    printf("        should be:   DHRYSTONE PROGRAM, 2'ND STRING\n");
+  }
+
+  printf ("Finished in %d ms\n", (int)User_Time);
+  printf("==================================================\n");
+  {
+    /* A measured time of 0 ms (no working timer, or a very short run) */
+    /* must not be used as a divisor; report 0 Marks in that case.     */
+    /* The product is formed in 64 bits so that short run times cannot */
+    /* overflow int; the evaluation order of the formula is unchanged. */
+    int Elapsed_Ms = (int)User_Time;
+    int Marks      = 0;
+
+    if (Elapsed_Ms > 0)
+      Marks = (int)((long long)(880900 / Elapsed_Ms) * NUMBER_OF_RUNS / 500000);
+
+    printf("Dhrystone %s         %d Marks\n", pass ? "PASS" : "FAIL", Marks);
+  }
+  printf("                   vs. 100000 Marks (i7-7700K @ 4.20GHz)\n");
+
+  return (pass ? 0 : 1);
+}
+
+
--- a/am-kernels/benchmarks/microbench/Makefile
+++ b/am-kernels/benchmarks/microbench/Makefile
@ -0,0 +1,3 @@
+# Name of this benchmark program.
+NAME = microbench
+# Every C (*.c) and C++ (*.cc) source file found under src/.
+SRCS = $(shell find src/ -name "*.c" -o -name "*.cc")
+# The shared build rules come from the Makefile under $(AM_HOME).
+include $(AM_HOME)/Makefile
--- a/am-kernels/benchmarks/microbench/README.md
+++ b/am-kernels/benchmarks/microbench/README.md
@ -0,0 +1,68 @@
+# MicroBench
+
+CPU正确性和性能测试用基准程序。对AbstractMachine的要求：
+
+1. 需要实现TRM和IOE的API。
+2. 在IOE的全部实现均留空的情况下仍可运行。如果有正确实现的`AM_TIMER_UPTIME`，可以输出正确的统计时间。若这个功能没有实现(返回`0`)，仍可进行正确性测试。
+3. 使用`putch(ch)`输出。
+4. 堆区`heap`必须初始化(堆区可为空)。如果`heap.start == heap.end`，即分配了空的堆区，只能运行不使用堆区的测试程序。每个基准程序会预先指定堆区的大小，堆区不足的基准程序将被忽略。
+
+## 使用方法
+
+同一组程序分成四组：test，train，ref和huge。
+
+| 名称  | 动态指令数  | 计时 | 计分 | 建议使用场景  |
+| ----- | ----------- | ---- | ---- | ----- |
+| test  | 约300K      |  X   |  X   | 正确性测试  |
+| train | 约60M       |  O   |  X   | 在RTL仿真环境中研究微结构行为 |
+| ref   | 约2B        |  O   |  O   | 在模拟器或FPGA环境中评估处理器性能 |
+| huge  | 约50B       |  O   |  O   | 衡量高性能处理器(如真机)的性能 |
+
+默认运行ref数据规模，可通过`mainargs`选择其它的数据规模, 如:
+```bash
+make ARCH=native run mainargs=huge
+```
+
+## 评分根据
+
+每个benchmark都记录以`REF_CPU`为基础测得的运行时间微秒数。每个benchmark的评分是相对于`REF_CPU`的运行速度，与基准处理器一样快的得分为`REF_SCORE=100000`。
+
+所有benchmark的平均得分是整体得分。
+
+## 已有的基准程序
+
+| 名称    | 描述                                | ref堆区使用  | huge堆区使用 |
+| ----- | -------------------------------------------- | ----- |  ----- |
+| qsort | 快速排序随机整数数组                         | 640KB | 16MB  |
+| queen | 位运算实现的n皇后问题                        | 0     | 0     |
+| bf    | Brainf**k解释器，快速排序输入的字符串        | 32KB  | 32KB  |
+| fib   | Fibonacci数列f(n)=f(n-1)+…+f(n-m)的矩阵求解  | 256KB | 2MB   |
+| sieve | Eratosthenes筛法求素数                       | 2MB   | 10MB  |
+| 15pz  | A*算法求解4x4数码问题                        | 2MB   | 64MB  |
+| dinic | Dinic算法求解二分图最大流                    | 680KB | 2MB   |
+| lzip  | Lzip数据压缩                                 | 4MB   | 64MB  |
+| ssort | Skew算法后缀排序                             | 4MB   | 64MB  |
+| md5   | 计算长随机字符串的MD5校验和                  | 10MB  | 64MB  |
+
+## 增加一个基准程序`foo`
+
+在`src/`目录下建立名为`foo`的目录，将源代码文件放入。
+
+每个基准程序需要实现三个函数：
+
+* `void bench_foo_prepare();`：进行准备工作，如初始化随机数种子、为数组分配内存等。运行时环境不保证全局变量和堆区的初始值，因此基准程序使用的全局数据必须全部初始化。
+* `void bench_foo_run();`：实际运行基准程序。只有这个函数会被计时。
+* `int bench_foo_validate();`：验证基准程序运行结果。正确返回1，错误返回0。
+
+在`benchmark.h`的`BENCHMARK_LIST`中增加相应的`def`项，格式参考已有的benchmark。
+
+## 基准程序可以使用的库函数
+
+虽然klib中提供了一些函数，但不同的klib实现会导致性能测试结果有差异。
+因此MicroBench中内置一些简单的库函数:
+
+* `bench_memcpy(void *dst, const void *src, size_t n)`: 内存复制。
+* `bench_srand(uint32_t seed)`：用seed初始化随机数种子。
+* `bench_rand()`：返回一个0..32767之间的随机数。
+* `bench_alloc`/`bench_free`：内存分配/回收。目前回收是空操作。
+
--- a/am-kernels/benchmarks/microbench/include/benchmark.h
+++ b/am-kernels/benchmarks/microbench/include/benchmark.h
@ -0,0 +1,123 @@
+#ifndef __BENCHMARK_H__
+#define __BENCHMARK_H__
+
+#include <am.h>
+#include <klib.h>
+#include <klib-macros.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MB * 1024 * 1024
+#define KB * 1024
+
+#define REF_CPU    "i9-9900K @ 3.60GHz"
+#define REF_SCORE  100000
+
+#define REPEAT  1
+
+//                  size |  heap | time |  checksum
+#define QSORT_S {     100,   1 KB,     0, 0x08467105}
+#define QSORT_M {   30000, 128 KB,     0, 0xa3e99fe4}
+#define QSORT_L {  100000, 640 KB,  4404, 0xed8cff89}
+#define QSORT_H { 4000000,  16 MB,227620, 0xe6178735}
+#define QUEEN_S {       8,   0 KB,     0, 0x0000005c}
+#define QUEEN_M {      11,   0 KB,     0, 0x00000a78}
+#define QUEEN_L {      12,   0 KB,  4069, 0x00003778}
+#define QUEEN_H {      15,   0 KB,819996, 0x0022c710}
+#define    BF_S {       2,  32 KB,     0, 0xa6f0079e}
+#define    BF_M {      25,  32 KB,     0, 0xa88f8a65}
+#define    BF_L {     180,  32 KB, 16815, 0x9221e2b3}
+#define    BF_H {    1360,  32 KB,771535, 0xdb49fbde}
+#define   FIB_S {       2,   1 KB,     0, 0x7cfeddf0}
+#define   FIB_M {      23,  16 KB,     0, 0x94ad8800}
+#define   FIB_L {      91, 256 KB, 20168, 0xebdc5f80}
+#define   FIB_H {     300,   2 MB,775012, 0xe30a6f00}
+#define SIEVE_S {     100,   1 KB,     0, 0x00000019}
+#define SIEVE_M {  200000,  32 KB,     0, 0x00004640}
+#define SIEVE_L {10000000,   2 MB, 34823, 0x000a2403}
+#define SIEVE_H {80000000,  10 MB,301058, 0x00473fc6}
+#define  PZ15_S {       0,   1 KB,     0, 0x00000006}
+#define  PZ15_M {       1, 256 KB,     0, 0x0000b0df}
+#define  PZ15_L {       2,   2 MB,  5360, 0x00068b8c}
+#define  PZ15_H {       3,  64 MB,300634, 0x01027b4a}
+#define DINIC_S {      10,   8 KB,     0, 0x0000019c}
+#define DINIC_M {      80, 512 KB,     0, 0x00004f99}
+#define DINIC_L {     128, 680 KB,  8182, 0x0000c248}
+#define DINIC_H {     190,   2 MB,671978, 0x00014695}
+#define  LZIP_S {     128, 128 KB,     0, 0xe05fc832}
+#define  LZIP_M {   50000,   1 MB,     0, 0xdc93e90c}
+#define  LZIP_L { 1048576,   4 MB,  6795, 0x8d62c81f}
+#define  LZIP_H {31457280,  64 MB,199541, 0x1b859d76}
+#define SSORT_S {     100,   4 KB,     0, 0x4c555e09}
+#define SSORT_M {   10000, 512 KB,     0, 0x0db7909b}
+#define SSORT_L {  100000,   4 MB,  4002, 0x4f0ab431}
+#define SSORT_H { 3000000,  64 MB,322232, 0xeddbd9b6}
+#define   MD5_S {     100,   1 KB,     0, 0xf902f28f}
+#define   MD5_M {  200000, 256 KB,     0, 0xd4f9bc6d}
+#define   MD5_L {10000000,  10 MB, 15199, 0x27286a42}
+#define   MD5_H {64000000,  64 MB, 97148, 0x41ab4d60}
+
+#define BENCHMARK_LIST(def) \
+  def(qsort, "qsort", QSORT_S, QSORT_M, QSORT_L, QSORT_H, "Quick sort") \
+  def(queen, "queen", QUEEN_S, QUEEN_M, QUEEN_L, QUEEN_H, "Queen placement") \
+  def(   bf,    "bf",    BF_S,    BF_M,    BF_L,    BF_H, "Brainf**k interpreter") \
+  def(  fib,   "fib",   FIB_S,   FIB_M,   FIB_L,   FIB_H, "Fibonacci number") \
+  def(sieve, "sieve", SIEVE_S, SIEVE_M, SIEVE_L, SIEVE_H, "Eratosthenes sieve") \
+  def( 15pz,  "15pz",  PZ15_S,  PZ15_M,  PZ15_L,  PZ15_H, "A* 15-puzzle search") \
+  def(dinic, "dinic", DINIC_S, DINIC_M, DINIC_L, DINIC_H, "Dinic's maxflow algorithm") \
+  def( lzip,  "lzip",  LZIP_S,  LZIP_M,  LZIP_L,  LZIP_H, "Lzip compression") \
+  def(ssort, "ssort", SSORT_S, SSORT_M, SSORT_L, SSORT_H, "Suffix sort") \
+  def(  md5,   "md5",   MD5_S,   MD5_M,   MD5_L,   MD5_H, "MD5 digest") \
+
+// Each benchmark will run REPEAT times
+
+#define DECL(_name, _sname, _s, _m, _l, _h, _desc) \
+  void bench_##_name##_prepare(); \
+  void bench_##_name##_run(); \
+  int bench_##_name##_validate();
+
+BENCHMARK_LIST(DECL)
+
+// One input scale of a benchmark. Filled from the *_S / *_M / *_L / *_H
+// initializer macros above, whose columns are: size | heap | time | checksum.
+typedef struct Setting {
+  int size;                 // problem-size parameter passed to the benchmark
+  unsigned long mlim, ref;  // mlim: heap bytes required; ref: reference run time used for scoring
+  uint32_t checksum;        // expected result checksum
+} Setting;
+
+// Descriptor of one benchmark: its three entry points, its short name and
+// description, and one Setting per input scale.
+typedef struct Benchmark {
+  void (*prepare)();        // set-up hook
+  void (*run)();            // benchmark body
+  int (*validate)();        // result check; its return value is stored as Result.pass
+  const char *name, *desc;  // short name and human-readable description
+  Setting settings[4];      // initialised in the order _s, _m, _l, _h
+} Benchmark;
+
+extern Benchmark *current;
+extern Setting *setting;
+
+// Outcome of a single benchmark run.
+typedef struct Result {
+  int pass;       // value returned by the benchmark's validate()
+  uint64_t usec;  // run time in microseconds
+} Result;
+
+void prepare(Result *res);
+void done(Result *res);
+
+// memory allocation
+void* bench_alloc(size_t size);
+void bench_free(void *ptr);
+
+// random number generator
+void bench_srand(uint32_t seed);
+uint32_t bench_rand(); // return a random number between 0..32767
+
+// checksum
+uint32_t checksum(void *start, void *end);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/am-kernels/benchmarks/microbench/src/15pz/15pz.cc
+++ b/am-kernels/benchmarks/microbench/src/15pz/15pz.cc
@ -0,0 +1,96 @@
+#include <benchmark.h>
+#include "puzzle.h"
+#include "heap.h"
+
+const int N = 4;
+
+static int PUZZLE_S[N*N] = {
+  1, 2, 3, 4,
+  5, 6, 7, 8,
+  9, 10, 0, 11,
+  13, 14, 15, 12,
+};
+
+static int PUZZLE_M[N*N] = {
+  1, 2, 3, 4,
+  5, 6, 7, 8,
+  12, 0, 14, 13,
+  11, 15, 10, 9,
+};
+
+static int PUZZLE_L[N*N] = {
+  0, 2, 3, 4,
+  9, 6, 7, 8,
+  5, 11, 10, 12,
+  1, 15, 13, 14,
+};
+
+static int PUZZLE_H[N*N] = {
+  2, 6, 8, 0,
+  9, 15, 4, 12,
+  5, 13, 11,14,
+  1, 7, 3, 10,
+};
+
+static int ans;
+
+extern "C" {
+
+void bench_15pz_prepare() {
+}
+
+// Search from the start position selected by setting->size towards the solved
+// puzzle, expanding at most MAXN states. On success `ans` becomes
+// (path length of the solution) * (number of states popped); if the search
+// ends without reaching the solution `ans` stays -1.
+void bench_15pz_run() {
+  // Default construction draws a random puzzle via bench_rand(); the value is
+  // replaced by one of the fixed start positions below.
+  N_puzzle<N> puzzle;
+  int MAXN;
+
+  switch (setting->size) {
+    case 0: puzzle = N_puzzle<N>(PUZZLE_S); MAXN = 10; break;
+    case 1: puzzle = N_puzzle<N>(PUZZLE_M); MAXN = 2048; break;
+    case 2: puzzle = N_puzzle<N>(PUZZLE_L); MAXN = 16384; break;
+    case 3: puzzle = N_puzzle<N>(PUZZLE_H); MAXN = 786432; break;
+    default: assert(0);
+  }
+  assert(puzzle.solvable());
+
+  // The heap object is placed in bench_alloc() memory; no constructor runs, so
+  // init() performs all set-up.
+  auto *heap = (Updatable_heap<N_puzzle<N>> *) bench_alloc(sizeof(Updatable_heap<N_puzzle<N>>));
+  heap->init(MAXN);
+  heap->push( puzzle, 0 );
+
+  int n = 0;  // number of states popped so far
+  ans = -1;
+
+  while( heap->size() != 0 && n != MAXN ) {
+    N_puzzle<N> top = heap->pop();
+    ++n;
+
+    if ( top == N_puzzle<N>::solution() ) {
+      // We are done
+      ans = heap->length(top) * n;
+      return;
+    }
+
+    // Push every legal successor with a path one step longer than `top`.
+    if ( top.tile_left_possible() ) {
+      heap->push( top.tile_left(), heap->length( top ) + 1 );
+    }
+
+    if ( top.tile_right_possible() ) {
+      heap->push( top.tile_right(), heap->length( top ) + 1 );
+    }
+
+    if ( top.tile_up_possible() ) {
+      heap->push( top.tile_up(), heap->length( top ) + 1 );
+    }
+
+    if ( top.tile_down_possible() ) {
+      heap->push( top.tile_down(), heap->length( top ) + 1 );
+    }
+  }
+}
+
+
+// Returns 1 when the value left in `ans` by the run, viewed as an unsigned
+// 32-bit number, equals the checksum of the active setting; 0 otherwise.
+int bench_15pz_validate() {
+  const uint32_t observed = (uint32_t)ans;
+  return observed == setting->checksum;
+}
+
+}
+
--- a/am-kernels/benchmarks/microbench/src/15pz/heap.h
+++ b/am-kernels/benchmarks/microbench/src/15pz/heap.h
@ -0,0 +1,227 @@
+// Author:  Douglas Wilhelm Harder
+// Copyright (c) 2009 by Douglas Wilhelm Harder.  All rights reserved.
+
+template <typename T>
+T max(T a, T b) {
+  return a > b ? a : b;
+}
+
+template <typename T>
+class Updatable_heap {
+  private:
+    int M;
+    class Step;
+    Step **hash_table;
+    Step **heap;
+    int heap_size;
+    int maximum_heap_size;
+
+    void inline swap( int, int );
+    void percolate_down();
+    void percolate_up( int );
+    Step *pointer( T const & ) const;
+
+  public:
+    void init(int m);
+    ~Updatable_heap();
+    T pop();
+    void push( T const &, int );
+    int size() const;
+    int maximum_size() const;
+    int length( T const & ) const;
+};
+
+template <typename T>
+class Updatable_heap<T>::Step {
+  public:
+    T element;
+    Step *next;
+    int heap_index;
+    int path_length;
+    int path_weight;
+    bool visited;
+    Step *previous_step;
+
+    void init( T const &, Step *, int, int );
+    int length() const;
+    int weight() const;
+};
+
+// Set up an empty heap sized by m: allocates the heap array and the hash
+// table from bench_alloc() and clears the M hash buckets that
+// `hash & (M - 1)` can address.
+template <typename T>
+void Updatable_heap<T>::init(int m) {
+  M = m;
+  // heap[] is indexed from 1, and push() only asserts heap_size <= M before
+  // incrementing, so the highest slot it may write is heap[M + 1]. Allocate
+  // M + 2 entries so that store stays inside the array.
+  heap = (Step **)bench_alloc(sizeof(void *) * (M + 2));
+  hash_table = (Step **)bench_alloc(sizeof(void *) * (M + 1));
+
+  heap_size = 0;
+  maximum_heap_size = 0;
+  for ( int i = 0; i < M; ++i ) {
+    hash_table[i] = 0;
+  }
+}
+
+// Visits every node of every hash bucket chain. Nothing is released here.
+template <typename T>
+Updatable_heap<T>::~Updatable_heap() {
+  for ( int bucket = 0; bucket < M; ++bucket ) {
+    for ( Step *node = hash_table[bucket]; node != 0; node = node->next ) {
+      // intentionally empty
+    }
+  }
+}
+
+// Remove and return the element at the root of the heap (index 1).
+// An empty heap yields a default-constructed T.
+template <typename T>
+T Updatable_heap<T>::pop() {
+  if ( size() == 0 ) {
+    return T();
+  }
+
+  T top = heap[1]->element;
+
+  if ( size() == 1 ) {
+    heap_size = 0;
+  } else {
+    assert( size() > 1 );
+
+    // Move the last entry to the root, then restore the heap order.
+    heap[1] = heap[size()];
+    heap[1]->heap_index = 1;
+
+    --heap_size;
+    percolate_down();
+  }
+
+  return top;
+}
+
+template <typename T>
+void inline Updatable_heap<T>::swap( int i, int j ) {
+  Step *tmp = heap[j];
+  heap[j] = heap[i];
+  heap[i] = tmp;
+
+  heap[i]->heap_index = i;
+  heap[j]->heap_index = j;
+}
+
+// Sink the root entry until neither child has a smaller-or-equal weight.
+template <typename T>
+void Updatable_heap<T>::percolate_down() {
+  int n = 1;
+
+  // Both children exist.
+  while ( 2*n + 1 <= size() ) {
+    // Stop once the entry is strictly lighter than both children.
+    if ( heap[n]->weight() < heap[2*n]->weight() && heap[n]->weight() < heap[2*n + 1]->weight() ) {
+      return;
+    }
+
+    // Swap with the lighter child; on equal weights the right child is chosen.
+    if ( heap[2*n]->weight() < heap[2*n + 1]->weight() ) {
+      swap( n, 2*n );
+      n = 2*n;
+    } else {
+      assert( heap[2*n]->weight() >= heap[2*n + 1]->weight() );
+
+      swap( n, 2*n + 1 );
+      n = 2*n + 1;
+    }
+  }
+
+  // Only a left child exists: swap if it is strictly lighter.
+  if ( 2*n == size() &&  heap[2*n]->weight() < heap[n]->weight() ) {
+    swap( n, 2*n );
+  }
+}
+
+template <typename T>
+void Updatable_heap<T>::percolate_up( int n ) {
+  while ( n != 1 ) {
+    int parent = n/2;
+
+    if ( heap[parent]->weight() > heap[n]->weight() ) {
+      swap( parent, n );
+      n = parent;
+    } else {
+      return;
+    }
+  }
+}
+
+// Insert pz with the given path length, or, if an equal element is already
+// stored and not marked visited, lower its weight when the new path is
+// cheaper.
+template <typename T>
+void Updatable_heap<T>::push( T const &pz, int path_length ) {
+  Step *ptr = pointer( pz );
+
+  if ( ptr == 0 ) {
+    assert( heap_size <= M );
+    ++heap_size;
+
+    // NOTE(review): this declaration shadows the outer `ptr`.
+    Step *ptr = (Step*)bench_alloc(sizeof(Step));
+    // Link the new step at the head of its hash bucket and at the end of the heap.
+    ptr->init( pz, hash_table[pz.hash() & (M - 1)], size(), path_length );
+    hash_table[pz.hash() & (M - 1)] = ptr;
+    heap[size()] = ptr;
+
+    percolate_up( size() );
+
+    maximum_heap_size = max( maximum_heap_size, size() );
+  } else {
+    if ( !ptr->visited ) {
+      if ( path_length + ptr->element.lower_bound() < ptr->weight() ) {
+        // NOTE(review): only path_weight is lowered; path_length keeps its
+        // previous value - confirm this is intended.
+        ptr->path_weight = path_length + ptr->element.lower_bound();
+        percolate_up( ptr->heap_index );
+      }
+    }
+  }
+}
+
+template <typename T>
+int Updatable_heap<T>::size() const {
+  return heap_size;
+}
+
+template <typename T>
+int Updatable_heap<T>::maximum_size() const {
+  return maximum_heap_size;
+}
+
+template <typename T>
+int Updatable_heap<T>::length( T const &pz ) const {
+  Step *ptr = pointer( pz );
+
+  return ( ptr == 0 ) ? 2147483647 : ptr->length();
+}
+
+template <typename T>
+typename Updatable_heap<T>::Step *Updatable_heap<T>::pointer( T const &pz ) const {
+  for ( Step *ptr = hash_table[pz.hash() & (M - 1)]; ptr != 0; ptr = ptr->next ) {
+    if ( ptr->element == pz ) {
+      return ptr;
+    }
+  }
+
+  return 0;
+}
+
+/****************************************************
+ * ************************************************ *
+ * *                   Iterator                   * *
+ * ************************************************ *
+ ****************************************************/
+
+template <typename T>
+void Updatable_heap<T>::Step::init( T const &pz, Step *n, int hi, int dist ) {
+  element = pz;
+  next = n;
+  heap_index = hi;
+  path_length = dist;
+  path_weight = dist + element.lower_bound();
+  visited = false;
+  previous_step = 0;
+}
+
+template <typename T>
+int Updatable_heap<T>::Step::length() const {
+  return path_length;
+}
+
+template <typename T>
+int Updatable_heap<T>::Step::weight() const {
+  return path_weight;
+}
+
--- a/am-kernels/benchmarks/microbench/src/15pz/puzzle.h
+++ b/am-kernels/benchmarks/microbench/src/15pz/puzzle.h
@ -0,0 +1,475 @@
+// Author:  Douglas Wilhelm Harder
+// Copyright (c) 2009 by Douglas Wilhelm Harder.  All rights reserved.
+// Url: https://ece.uwaterloo.ca/~dwharder/aads/Algorithms/N_puzzles/
+
+template <int N>
+class N_puzzle {
+  private:
+    bool puzzle_valid;
+    uint8_t zero_i, zero_j;
+    int8_t manhattan_distance;
+    int8_t puzzle[N][N];
+    int hash_value;
+
+    void determine_hash();
+
+    static int abs( int n ) { return ( n < 0 ) ? -n : n; }
+
+  public:
+    N_puzzle();
+    N_puzzle( int array[N*N] );
+    N_puzzle( N_puzzle const & );
+    N_puzzle &operator=( N_puzzle const & );
+
+    bool solvable() const;
+    bool valid() const;
+    int lower_bound() const;
+    unsigned int hash() const;
+
+    bool tile_up_possible() const;
+    bool tile_down_possible() const;
+    bool tile_left_possible() const;
+    bool tile_right_possible() const;
+
+    N_puzzle tile_up() const;
+    N_puzzle tile_down() const;
+    N_puzzle tile_left() const;
+    N_puzzle tile_right() const;
+
+    bool operator==( N_puzzle const & ) const;
+    bool operator!=( N_puzzle const & ) const;
+
+    N_puzzle static solution();
+};
+
+template < int N >
+N_puzzle<N>::N_puzzle():
+puzzle_valid( true ),
+manhattan_distance( 0 ) {
+  int array[N*N];
+
+  for ( int i = 0; i < N*N; ++i ) {
+    array[i] = i;
+  }
+
+  int n = 0;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      int k = bench_rand() % (N*N - n);
+      puzzle[i][j] = array[k];
+
+      if ( array[k] == 0 ) {
+        zero_i = i;
+        zero_j = j;
+      } else {
+        manhattan_distance += abs( ((array[k] - 1) / N) - i );
+        manhattan_distance += abs( ((array[k] - 1) % N) - j );
+      }
+
+      ++n;
+      array[k] = array[N*N - n];
+    }
+  }
+
+  determine_hash();
+}
+
+// Build a puzzle from N*N tile values given in row-major order (0 is the
+// blank). Records the blank's position and accumulates the Manhattan distance
+// of every non-blank tile from its solved position. If some value in
+// 0..N*N-1 never occurs the puzzle is flagged invalid.
+template < int N >
+N_puzzle<N>::N_puzzle( int array[N*N] ):
+puzzle_valid( true ),
+manhattan_distance( 0 ) {
+  bool check[N*N];
+
+  for ( int i = 0; i < N*N; ++i ) {
+    check[i] = false;
+  }
+
+  int n = 0;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      puzzle[i][j] = array[n];
+      // NOTE(review): array[n] is used as an index without a range check.
+      check[array[n]] = true;
+
+      if ( array[n] == 0 ) {
+        zero_i = i;
+        zero_j = j;
+      } else {
+        // Tile v belongs at row (v-1)/N, column (v-1)%N.
+        manhattan_distance += abs( ((array[n] - 1) / N) - i );
+        manhattan_distance += abs( ((array[n] - 1) % N) - j );
+      }
+
+      ++n;
+    }
+  }
+
+  for ( int i = 0; i < N*N; ++i ) {
+    if ( !check[i] ) {
+      // Early exit: hash_value is left unset; hash() reports 0 for invalid puzzles.
+      puzzle_valid = false;
+      return;
+    }
+  }
+
+  determine_hash();
+}
+
+/*
+ * Determine a hash value for the puzzle.
+ */
+
+// Recompute hash_value from the tiles in row-major order as a polynomial
+// in 1973.
+template < int N >
+void N_puzzle<N>::determine_hash() {
+  // The running value exceeds the range of int after a few tiles. Signed
+  // overflow is undefined behaviour, so accumulate in unsigned arithmetic
+  // (which wraps modulo 2^32) and store the resulting bit pattern.
+  unsigned int h = 0;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      h = h*1973u + static_cast<unsigned int>( puzzle[i][j] );
+    }
+  }
+
+  hash_value = static_cast<int>( h );
+}
+
+template < int N >
+N_puzzle<N>::N_puzzle( N_puzzle const &pz ):
+puzzle_valid( pz.puzzle_valid ),
+zero_i( pz.zero_i ),
+zero_j( pz.zero_j ),
+manhattan_distance( pz.manhattan_distance ),
+hash_value( pz.hash_value ) {
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      puzzle[i][j] = pz.puzzle[i][j];
+    }
+  }
+}
+
+template < int N >
+N_puzzle<N> &N_puzzle<N>::operator=( N_puzzle const &rhs ) {
+  puzzle_valid = rhs.puzzle_valid;
+  zero_i = rhs.zero_i;
+  zero_j = rhs.zero_j;
+  manhattan_distance = rhs.manhattan_distance;
+  hash_value = rhs.hash_value;
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      puzzle[i][j] = rhs.puzzle[i][j];
+    }
+  }
+  return *this;
+}
+
+
+/*
+ *  Moving a tile up is possible as long as
+ *  the blank is not in the last row.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_up_possible() const {
+  return puzzle_valid && (zero_i != N - 1);
+}
+
+/*
+ *  Moving a tile down is possible as long as
+ *  the blank is not in the first row.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_down_possible() const {
+  return puzzle_valid && (zero_i != 0);
+}
+
+/*
+ *  Moving a tile left is possible as long as
+ *  the blank is not in the last column.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_left_possible() const {
+  return puzzle_valid && (zero_j != N - 1);
+}
+
+/*
+ *  Moving a tile right is possible as long as
+ *  the blank is not in the first column.
+ */
+
+template <int N>
+bool N_puzzle<N>::tile_right_possible() const {
+  return puzzle_valid && (zero_j != 0);
+}
+
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_up() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle result( *this );
+
+  if ( zero_i == N - 1 ) {
+    result.puzzle_valid = false;
+    return result;
+  }
+
+  result.manhattan_distance +=
+    abs( ((puzzle[zero_i + 1][zero_j] - 1) / N) - zero_i ) -
+    abs( ((puzzle[zero_i + 1][zero_j] - 1) / N) - (zero_i + 1) );
+
+  result.puzzle[zero_i][zero_j] = puzzle[zero_i + 1][zero_j];
+  ++result.zero_i;
+  result.puzzle[result.zero_i][zero_j] = 0;
+
+  result.determine_hash();
+
+  return result;
+}
+
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_down() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle result( *this );
+
+  if ( zero_i == 0 ) {
+    result.puzzle_valid = false;
+    return result;
+  }
+
+  result.manhattan_distance +=
+    abs( ((puzzle[zero_i - 1][zero_j] - 1) / N) - zero_i ) -
+    abs( ((puzzle[zero_i - 1][zero_j] - 1) / N) - (zero_i - 1) );
+
+  result.puzzle[zero_i][zero_j] = puzzle[zero_i - 1][zero_j];
+  --result.zero_i;
+  result.puzzle[result.zero_i][zero_j] = 0;
+
+  result.determine_hash();
+
+  return result;
+}
+
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_left() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle result( *this );
+
+  if ( zero_j == N - 1 ) {
+    result.puzzle_valid = false;
+    return result;
+  }
+
+  result.manhattan_distance +=
+    abs( ((puzzle[zero_i][zero_j + 1] - 1) % N) - zero_j ) -
+    abs( ((puzzle[zero_i][zero_j + 1] - 1) % N) - (zero_j + 1) );
+
+  result.puzzle[zero_i][zero_j] = puzzle[zero_i][zero_j + 1];
+  ++result.zero_j;
+  result.puzzle[zero_i][result.zero_j] = 0;
+
+  result.determine_hash();
+
+  return result;
+}
+
+template <int N>
+N_puzzle<N> N_puzzle<N>::tile_right() const {
+  if ( !puzzle_valid ) {
+    return *this;
+  }
+
+  N_puzzle result( *this );
+
+  if ( zero_j == 0 ) {
+    result.puzzle_valid = false;
+    return result;
+  }
+
+  result.manhattan_distance +=
+    abs( ((puzzle[zero_i][zero_j - 1] - 1) % N) - zero_j ) -
+    abs( ((puzzle[zero_i][zero_j - 1] - 1) % N) - (zero_j - 1) );
+
+  result.puzzle[zero_i][zero_j] = puzzle[zero_i][zero_j - 1];
+  --result.zero_j;
+  result.puzzle[zero_i][result.zero_j] = 0;
+
+  result.determine_hash();
+
+  return result;
+}
+
+/*
+ *  Check if the puzzle is solvable:  that is, check the
+ *  number of inversions plus the Manhattan distance of
+ *  the blank from the lower-right corner.
+ *
+ *  Run time:   O(n^2)
+ *  Memory:     O(n)
+ */
+
+template <int N>
+bool N_puzzle<N>::solvable() const {
+  if ( !valid() ) {
+    return false;
+  }
+
+  int entries[N*N];
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      if ( puzzle[i][j] == 0 ) {
+        entries[N*i + j] = N*N;
+      } else {
+        entries[N*i + j] = puzzle[i][j];
+      }
+    }
+  }
+
+  int parity = 0;
+
+  for ( int i = 0; i < N*N; ++i ) {
+    for ( int j = i + 1; j < N*N; ++j ) {
+      if ( entries[i] > entries[j] ) {
+        ++parity;
+      }
+    }
+  }
+
+  parity += 2*N - 2 - zero_i - zero_j;
+
+  return ( (parity & 1) == 0 );
+}
+
+template <int N>
+bool N_puzzle<N>::valid() const {
+  return puzzle_valid;
+}
+
+/*
+ *  Return the Manhattan distance between the puzzle and the solution,
+ *  or N*N*N when the puzzle is not valid.
+ */
+
+template <int N>
+int N_puzzle<N>::lower_bound() const {
+  return valid() ? manhattan_distance : N*N*N;
+}
+
+/*
+ *  puzzle1 == puzzle2
+ *
+ *  Two puzzles are considered to be equal if their entries
+ *  are equal:
+ *    If either puzzle is not valid, return false.
+ *    If the hash values are different, they are different; return false.
+ *    Otherwise, check all entries to see if they are the same.
+ */
+
+template < int N >
+bool N_puzzle<N>::operator==( N_puzzle const &rhs ) const {
+  if ( !valid() || !rhs.valid() || hash() != rhs.hash() ) {
+    return false;
+  }
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      if ( puzzle[i][j] != rhs.puzzle[i][j] ) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+/*
+ *  puzzle1 != puzzle2
+ *
+ *  Two puzzles are considered to be unequal if any of their entries
+ *  differ:
+ *    If either puzzle is not valid, return false.
+ *    If the hash values are different, they are different; return true.
+ *    Otherwise, check all entries to see if they are the same.
+ */
+
+template < int N >
+bool N_puzzle<N>::operator!=( N_puzzle const &rhs ) const {
+  if ( !valid() || !rhs.valid() ) {
+    return false;
+  }
+
+  if ( hash() != rhs.hash() ) {
+    return true;
+  }
+
+  for ( int i = 0; i < N; ++i ) {
+    for ( int j = 0; j < N; ++j ) {
+      if ( puzzle[i][j] != rhs.puzzle[i][j] ) {
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/*
+ * unsigned int hash() const
+ *
+ *   Returns the pre-calculated hash value.
+ */
+
+template < int N >
+unsigned int N_puzzle<N>::hash() const {
+  return valid() ? hash_value : 0;
+}
+
+/*
+ * N_puzzle<N>  solution()
+ *
+ *   Returns the correct solution to the N puzzle:
+ *
+ *       1  2  3         1   2   3   4
+ *  3x3: 4  5  6   4x4:  5   6   7   8
+ *       7  8            9  10  11  12
+ *                      13  14  15
+ */
+
+template <int N>
+N_puzzle<N> N_puzzle<N>::solution() {
+  int array[N*N];
+
+  for ( int i = 0; i < N*N - 1; ++i ) {
+    array[i] = i + 1;
+  }
+
+  array[N*N - 1] = 0;
+
+  return N_puzzle<N>( array );
+}
+
--- a/am-kernels/benchmarks/microbench/src/bench.c
+++ b/am-kernels/benchmarks/microbench/src/bench.c
@ -0,0 +1,199 @@
+#include <am.h>
+#include <benchmark.h>
+#include <limits.h>
+#include <klib-macros.h>
+
+Benchmark *current;
+Setting *setting;
+
+static char *hbrk;
+
+// Reads the `us` field of the AM_TIMER_UPTIME device register.
+static uint64_t uptime() {
+  return io_read(AM_TIMER_UPTIME).us;
+}
+
+// Formats a microsecond count as "<milliseconds>.<3-digit remainder>".
+//
+// Returns a pointer to a static buffer, so the text is overwritten by the
+// next call and the function is not reentrant.
+static char *format_time(uint64_t us) {
+  static char buf[32];
+  uint64_t ms = us / 1000;
+  us -= ms * 1000;
+  assert(us < 1000);
+  // "%d" consumes an int. Passing the 64-bit value directly mismatches the
+  // format (undefined behaviour; garbage on 32-bit ABIs), so narrow it
+  // explicitly.
+  int len = sprintf(buf, "%d.000", (int)ms);
+  // Overwrite the "000" placeholder from the right with the digits of the
+  // sub-millisecond remainder; untouched positions stay '0'.
+  char *p = &buf[len - 1];
+  while (us > 0) {
+    *(p --) = '0' + us % 10;
+    us /= 10;
+  }
+  return buf;
+}
+
+// The benchmark list
+
+// ENTRY builds one designated initializer for a Benchmark record. The three
+// callbacks are resolved by token pasting to bench_<_name>_prepare,
+// bench_<_name>_run and bench_<_name>_validate; _s, _m, _l and _h fill
+// .settings[0..3] in that order. It is passed to BENCHMARK_LIST, which is
+// defined outside this file.
+#define ENTRY(_name, _sname, _s, _m, _l, _h, _desc) \
+  { .prepare = bench_##_name##_prepare, \
+    .run = bench_##_name##_run, \
+    .validate = bench_##_name##_validate, \
+    .name = _sname, \
+    .desc = _desc, \
+    .settings = {_s, _m, _l, _h}, },
+
+// Table of all benchmarks, generated by applying ENTRY through BENCHMARK_LIST.
+Benchmark benchmarks[] = {
+  BENCHMARK_LIST(ENTRY)
+};
+
+// Running a benchmark
+// Starts a measurement by storing the current uptime in res->usec.
+static void bench_prepare(Result *res) {
+  uint64_t started = uptime();
+  res->usec = started;
+}
+
+// Rewinds the bump allocator to the start of the heap, rounded up to 8.
+static void bench_reset() {
+  void *base = (void *)ROUNDUP(heap.start, 8);
+  hbrk = base;
+}
+
+// Ends a measurement: replaces the start time in res->usec by the elapsed time.
+static void bench_done(Result *res) {
+  uint64_t stopped = uptime();
+  res->usec = stopped - res->usec;
+}
+
+// Returns NULL when the heap is at least setting->mlim bytes large, and a
+// short explanation otherwise. The bench argument is not inspected; the
+// limit comes from the global `setting`.
+static const char *bench_check(Benchmark *bench) {
+  uintptr_t lo = (uintptr_t)heap.start;
+  uintptr_t hi = (uintptr_t)heap.end;
+  uintptr_t freesp = hi - lo;
+  return (freesp < setting->mlim) ? "(insufficient memory)" : NULL;
+}
+
+// Runs one timed repetition of the benchmark selected by the global
+// `current` and stores the elapsed time and validation result in *res.
+// The parameter b is not used: every call goes through `current`.
+static void run_once(Benchmark *b, Result *res) {
+  bench_reset();       // reset malloc state
+  current->prepare();  // call benchmark's prepare function (not timed)
+  bench_prepare(res);  // record the start time
+  current->run();      // run it
+  bench_done(res);     // convert the start time into elapsed time
+  res->pass = current->validate();
+}
+
+// Converts a run time into a score: REF_SCORE scaled by the ratio of the
+// setting's reference time to the measured time. A zero time scores 0.
+// The parameter b is not used; the reference comes from `setting`.
+static uint32_t score(Benchmark *b, uint64_t usec) {
+  if (usec != 0) {
+    uint64_t scaled = (uint64_t)(REF_SCORE) * setting->ref;
+    return scaled / usec;
+  }
+  return 0;
+}
+
+// Entry point: selects the input size from mainargs, runs every benchmark
+// REPEAT times, prints per-benchmark and overall results, and returns 0
+// only if every executed benchmark validated.
+int main(const char *args) {
+  // Index in this table == index into Benchmark.settings[].
+  static const char *const known[] = { "test", "train", "ref", "huge" };
+
+  const char *setting_name = args;
+  if (args == NULL || strcmp(args, "") == 0) {
+    printf("Empty mainargs. Use \"ref\" by default\n");
+    setting_name = "ref";
+  }
+
+  int setting_id = -1;
+  for (int k = 0; k < 4; k ++) {
+    if (strcmp(setting_name, known[k]) == 0) {
+      setting_id = k;
+      break;
+    }
+  }
+  if (setting_id < 0) {
+    printf("Invalid mainargs: \"%s\"; "
+           "must be in {test, train, ref, huge}\n", setting_name);
+    halt(1);
+  }
+
+  ioe_init();
+
+  printf("======= Running MicroBench [input *%s*] =======\n", setting_name);
+
+  uint32_t bench_score = 0;
+  int pass = 1;
+  uint64_t t0 = uptime();
+  uint64_t score_time = 0;
+
+  for (int i = 0; i < LENGTH(benchmarks); i ++) {
+    Benchmark *bench = &benchmarks[i];
+    current = bench;
+    setting = &bench->settings[setting_id];
+
+    const char *msg = bench_check(bench);
+    printf("[%s] %s: ", bench->name, bench->desc);
+    if (msg != NULL) {
+      // Not enough memory for this input size: skip without scoring.
+      printf("Ignored %s\n", msg);
+      continue;
+    }
+
+    // Keep the fastest of REPEAT runs; every run must validate.
+    uint64_t usec = ULLONG_MAX;
+    int succ = 1;
+    for (int round = 0; round < REPEAT; round ++) {
+      Result res;
+      run_once(bench, &res);
+      printf(res.pass ? "*" : "X");
+      succ &= res.pass;
+      if (res.usec < usec) usec = res.usec;
+      score_time += res.usec;
+    }
+
+    printf(succ ? " Passed." : " Failed.");
+    pass &= succ;
+
+    uint32_t cur = 0;
+    if (succ) cur = score(bench, usec);
+
+    printf("\n");
+    if (setting_id != 0) {
+      printf("  min time: %s ms [%d]\n", format_time(usec), cur);
+    }
+
+    bench_score += cur;
+  }
+  uint64_t total_time = uptime() - t0;
+
+  // Average over all table entries, including ignored ones.
+  bench_score /= LENGTH(benchmarks);
+
+  printf("==================================================\n");
+  printf("MicroBench %s", pass ? "PASS" : "FAIL");
+  if (setting_id >= 2) {
+    printf("        %d Marks\n", bench_score);
+    printf("                   vs. %d Marks (%s)\n", REF_SCORE, REF_CPU);
+  } else {
+    printf("\n");
+  }
+  printf("Scored time: %s ms\n", format_time(score_time));
+  printf("Total  time: %s ms\n", format_time(total_time));
+  return (pass ? 0 : 1);
+}
+
+// Libraries
+
+// Bump allocator: hands out `size` bytes (rounded up to a multiple of 8)
+// from the heap, zero-filled. Asserts that the new break stays inside the
+// heap and within the current setting's memory limit.
+void* bench_alloc(size_t size) {
+  size_t rounded = (size_t)ROUNDUP(size, 8);
+  char *block = hbrk;
+  hbrk = block + rounded;
+  assert((uintptr_t)heap.start <= (uintptr_t)hbrk && (uintptr_t)hbrk < (uintptr_t)heap.end);
+
+  // Clear the block one 64-bit word at a time.
+  uint64_t *word = (uint64_t *)block;
+  while (word != (uint64_t *)hbrk) {
+    *word = 0;
+    word ++;
+  }
+
+  assert((uintptr_t)hbrk - (uintptr_t)heap.start <= setting->mlim);
+  return block;
+}
+
+// Intentionally a no-op: individual blocks are never released.
+void bench_free(void *ptr) {
+  (void)ptr;
+}
+
+// State of the benchmark pseudo-random generator.
+static uint32_t seed = 1;
+
+// Re-seeds the generator; only the low 15 bits of _seed are kept.
+void bench_srand(uint32_t _seed) {
+  const uint32_t low15 = 0x7fff;
+  seed = _seed & low15;
+}
+
+// Advances the linear congruential state (mod 2^32) and returns bits
+// 16..30 of it, i.e. a value in [0, 0x7fff].
+uint32_t bench_rand() {
+  const uint32_t multiplier = (uint32_t)214013L;
+  const uint32_t increment = (uint32_t)2531011L;
+  seed = seed * multiplier + increment;
+  uint32_t upper = seed >> 16;
+  return upper & 0x7fff;
+}
+
+// FNV hash
+//
+// Hashes the bytes in [start, end) with an FNV-1a style xor/multiply step
+// followed by a shift/add/xor finalizer, and returns the 32-bit result.
+//
+// NOTE(review): the validate functions in this suite compare this value with
+// setting->checksum, so those expected values presumably depend on the exact
+// arithmetic below (including its quirks) - confirm before changing anything.
+uint32_t checksum(void *start, void *end) {
+  const uint32_t x = 16777619;
+  uint32_t h1 = 2166136261u;
+  // Consumes 4 bytes per step while strictly more than 4 bytes remain, so the
+  // final 1..4 bytes of the range are never hashed.
+  for (uint8_t *p = (uint8_t*)start; p + 4 < (uint8_t*)end; p += 4) {
+    for (int i = 0; i < 4; i ++) {
+      h1 = (h1 ^ p[i]) * x;
+    }
+  }
+  // The finalizer runs on a signed 32-bit value: the right shifts below
+  // operate on a possibly negative number and the left shifts/additions can
+  // exceed the int32_t range.
+  int32_t hash = (uint32_t)h1;
+  hash += hash << 13;
+  hash ^= hash >> 7;
+  hash += hash << 3;
+  hash ^= hash >> 17;
+  hash += hash << 5;
+  return hash;
+}
--- a/am-kernels/benchmarks/microbench/src/bf/bf.c
+++ b/am-kernels/benchmarks/microbench/src/bf/bf.c
@ -0,0 +1,151 @@
+/*
+ Brainfuck-C ( http://github.com/kgabis/brainfuck-c )
+ Copyright (c) 2012 Krzysztof Gabis
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+*/
+
+#include <benchmark.h>
+
+static int ARR_SIZE;
+
+#define CODE            ">>+>>>>>,[>+>>,]>+[--[+<<<-]<[<+>-]<[<[->[<<<+>>>>+<-]<<[>>+>[->]<<[<]" \
+                        "<-]>]>>>+<[[-]<[>+<-]<]>[[>>>]+<<<-<[<<[<<<]>>+>[>>>]<-]<<[<<<]>[>>[>>" \
+                        ">]<+<<[<<<]>-]]+<<<]+[->>>]>>]>>[.>>>]"
+
+#define OP_END          0
+#define OP_INC_DP       1
+#define OP_DEC_DP       2
+#define OP_INC_VAL      3
+#define OP_DEC_VAL      4
+#define OP_OUT          5
+#define OP_IN           6
+#define OP_JMP_FWD      7
+#define OP_JMP_BCK      8
+
+#define SUCCESS         0
+#define FAILURE         1
+
+#define PROGRAM_SIZE    4096
+#define STACK_SIZE      512
+#define DATA_SIZE       4096
+
+#define STACK_PUSH(A)   (STACK[SP++] = A)
+#define STACK_POP()     (STACK[--SP])
+#define STACK_EMPTY()   (SP == 0)
+#define STACK_FULL()    (SP == STACK_SIZE)
+
+// One compiled Brainfuck instruction.
+struct instruction_t {
+  unsigned short operator;  // one of the OP_* codes
+  unsigned short operand;   // for OP_JMP_FWD / OP_JMP_BCK: index of the matching bracket
+};
+
+static struct instruction_t *PROGRAM;
+static unsigned short *STACK;
+static unsigned int SP;
+static const char *code;
+static char *input;
+
+// Translates the Brainfuck text at `code` into PROGRAM, resolving bracket
+// pairs through STACK. Characters that are not instructions are skipped.
+// Returns SUCCESS, or FAILURE on unbalanced brackets, bracket nesting deeper
+// than STACK_SIZE, or a program that does not leave room for the OP_END
+// terminator. `code` is advanced as the text is consumed.
+static int compile_bf() {
+  unsigned short pc = 0;
+
+  while (*code && pc < PROGRAM_SIZE) {
+    unsigned short op;
+    switch (*code) {
+      case '>': op = OP_INC_DP; break;
+      case '<': op = OP_DEC_DP; break;
+      case '+': op = OP_INC_VAL; break;
+      case '-': op = OP_DEC_VAL; break;
+      case '.': op = OP_OUT; break;
+      case ',': op = OP_IN; break;
+      case '[': op = OP_JMP_FWD; break;
+      case ']': op = OP_JMP_BCK; break;
+      default:  op = OP_END; break;  // marks "not an instruction"
+    }
+
+    if (op == OP_END) {
+      // Ignore the character without emitting anything.
+      code ++;
+      continue;
+    }
+
+    if (op == OP_JMP_BCK) {
+      if (STACK_EMPTY()) {
+        return FAILURE;
+      }
+      // Link both brackets to each other.
+      unsigned short open_pc = STACK_POP();
+      PROGRAM[pc].operator = OP_JMP_BCK;
+      PROGRAM[pc].operand = open_pc;
+      PROGRAM[open_pc].operand = pc;
+    } else {
+      PROGRAM[pc].operator = op;
+      if (op == OP_JMP_FWD) {
+        if (STACK_FULL()) {
+          return FAILURE;
+        }
+        STACK_PUSH(pc);
+      }
+    }
+
+    pc ++;
+    code ++;
+  }
+
+  if (!STACK_EMPTY() || pc == PROGRAM_SIZE) {
+    return FAILURE;
+  }
+  PROGRAM[pc].operator = OP_END;
+  return SUCCESS;
+}
+
+static unsigned short *data;
+static char *output;
+static int noutput;
+
+// Interprets PROGRAM until OP_END, an unknown opcode, or the data pointer
+// leaving [0, DATA_SIZE). Input is read from `input`, output is appended
+// to `output` and counted in `noutput`.
+static void execute_bf() {
+  unsigned int pc = 0, ptr = 0;
+
+  for (;;) {
+    const struct instruction_t *ins = &PROGRAM[pc];
+    if (ins->operator == OP_END || ptr >= DATA_SIZE) {
+      break;
+    }
+
+    unsigned short op = ins->operator;
+    if      (op == OP_INC_DP)  { ptr ++; }
+    else if (op == OP_DEC_DP)  { ptr --; }
+    else if (op == OP_INC_VAL) { data[ptr] ++; }
+    else if (op == OP_DEC_VAL) { data[ptr] --; }
+    else if (op == OP_OUT)     { output[noutput ++] = data[ptr]; }
+    else if (op == OP_IN)      { data[ptr] = *(input ++); }
+    else if (op == OP_JMP_FWD) { if (!data[ptr]) { pc = ins->operand; } }
+    else if (op == OP_JMP_BCK) { if (data[ptr]) { pc = ins->operand; } }
+    else                       { return; }
+
+    pc ++;
+  }
+}
+
+// Allocates the interpreter's buffers and fills `input` with ARR_SIZE
+// pseudo-random alphanumeric characters (seed 1). The extra input byte
+// stays zero because bench_alloc() clears the memory it returns.
+void bench_bf_prepare() {
+  static const char alphabet[] =
+    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+  ARR_SIZE = setting->size;
+  SP = 0;
+  noutput = 0;
+  code = CODE;
+
+  // Allocation order is kept: it determines the buffer addresses.
+  PROGRAM = bench_alloc(sizeof(PROGRAM[0]) * PROGRAM_SIZE);
+  STACK = bench_alloc(sizeof(STACK[0]) * STACK_SIZE);
+  data = bench_alloc(sizeof(data[0]) * DATA_SIZE);
+  input = bench_alloc(ARR_SIZE + 1);
+  output = bench_alloc(DATA_SIZE);
+
+  bench_srand(1);
+  int filled = 0;
+  while (filled < ARR_SIZE) {
+    input[filled] = alphabet[bench_rand() % 62];
+    filled ++;
+  }
+}
+
+// Timed part of the benchmark: compile the Brainfuck program, then run it.
+//
+// The interpreter is started only when compilation succeeded. On FAILURE,
+// PROGRAM holds a partially built instruction stream (no OP_END terminator,
+// possibly unresolved '[' operands) that must not be executed; `noutput`
+// then keeps the value set by bench_bf_prepare().
+void bench_bf_run() {
+  if (compile_bf() != SUCCESS) {
+    return;
+  }
+  execute_bf();
+}
+
+// Passes when exactly ARR_SIZE characters were produced and their checksum
+// matches the expected value for the current setting.
+int bench_bf_validate() {
+  uint32_t cs = checksum(output, output + noutput);
+  if (noutput != ARR_SIZE) {
+    return 0;
+  }
+  return cs == setting->checksum;
+}
--- a/am-kernels/benchmarks/microbench/src/dinic/dinic.cc
+++ b/am-kernels/benchmarks/microbench/src/dinic/dinic.cc
@ -0,0 +1,138 @@
+#include <benchmark.h>
+
+static int N;
+const int INF = 0x3f3f3f;
+
+// Directed edge of the flow network with its capacity and current flow.
+struct Edge {
+  int from, to, cap, flow;
+
+  // Leaves all fields uninitialized.
+  Edge() {}
+
+  Edge(int from, int to, int cap, int flow)
+    : from(from), to(to), cap(cap), flow(flow) {}
+};
+
+// Returns x when x < y, otherwise y (so y is returned for equal values).
+template<typename T>
+static inline T min(T x, T y) {
+  if (x < y) {
+    return x;
+  }
+  return y;
+}
+
+// Max-flow solver in the style of Dinic's algorithm. Edges live in a flat
+// array and are chained per vertex through head[] / nxt[] (index -1 ends a
+// chain). AddEdge() always appends a forward edge and its reverse edge as a
+// pair, so the partner of edge i is edge i^1.
+struct Dinic {
+  int n, m, s, t;       // vertex count, edge entries used, source, sink
+  Edge *edges;
+  int *head, *nxt, *d, *cur, *queue;
+  bool *vis;
+
+  // Allocates all arrays for a graph with n vertices and marks every
+  // adjacency chain as empty.
+  void init(int n) {
+    // Edge capacity: (nold*nold + 2*nold) AddEdge calls, two entries each.
+    // This matches the graph built by bench_dinic_prepare(), which passes
+    // n = 2*N + 2.
+    int nold = (n - 2) / 2;
+    int maxm = (nold * nold + nold * 2) * 2;
+
+    edges = (Edge *)bench_alloc(sizeof(Edge) * maxm);
+    head = (int *)bench_alloc(sizeof(int) * n);
+    nxt = (int *)bench_alloc(sizeof(int) * maxm);
+    vis = (bool *)bench_alloc(sizeof(bool) * n);
+    d = (int *)bench_alloc(sizeof(int) * n);
+    cur = (int *)bench_alloc(sizeof(int) * n);
+    queue = (int *)bench_alloc(sizeof(int) * n);
+
+    this->n = n;
+    for (int i = 0; i < n; i ++) {
+      head[i] = -1;
+    }
+    m = 0;
+  }
+
+  // Adds edge u->v with capacity c plus its zero-capacity reverse edge.
+  // Edges with c == 0 are dropped entirely.
+  void AddEdge(int u, int v, int c) {
+    if (c == 0) return;
+    edges[m] = Edge(u, v, c, 0);
+    nxt[m] = head[u];
+    head[u] = m++;
+    edges[m] = Edge(v, u, 0, 0);
+    nxt[m] = head[v];
+    head[v] = m++;
+  }
+
+  // Breadth-first search from s over edges with residual capacity; fills
+  // d[] with distances and returns whether t was reached.
+  bool BFS() {
+    for (int i = 0; i < n; i ++) vis[i] = 0;
+    int qf = 0, qr = 0;
+    queue[qr ++] = s;
+    d[s] = 0;
+    vis[s] = 1;
+    while (qf != qr) {
+      int x = queue[qf ++];
+      for (int i = head[x]; i != -1; i = nxt[i]) {
+        Edge& e = edges[i];
+        if (!vis[e.to] && e.cap > e.flow) {
+          vis[e.to] = 1;
+          d[e.to] = d[x] + 1;
+          queue[qr ++] = e.to;
+        }
+      }
+    }
+    return vis[t];
+  }
+
+  // Pushes up to a units of flow from x towards t along edges that go one
+  // BFS level deeper; returns the amount actually pushed.
+  int DFS(int x, int a) {
+    if (x == t || a == 0) return a;
+    int flow = 0, f;
+    // cur[x] is only read as the starting edge: i is a local copy, so cur[]
+    // is not advanced by this search.
+    for (int i = cur[x]; i != -1; i = nxt[i]) {
+      Edge& e = edges[i];
+      if (d[x] + 1 == d[e.to] && (f = DFS(e.to, min(a, e.cap-e.flow))) > 0) {
+        e.flow += f;
+        edges[i^1].flow -= f;   // i^1 is the paired reverse edge
+        flow += f;
+        a -= f;
+        if (a == 0) break;
+      }
+    }
+    return flow;
+  }
+
+  // Repeats BFS + DFS until t is unreachable and returns the total flow.
+  int Maxflow(int s, int t) {
+    this -> s = s; this -> t = t;
+    int flow = 0;
+    while (BFS()) {
+      for (int i = 0; i < n; i++)
+        cur[i] = head[i];
+      flow += DFS(s, INF);
+    }
+    return flow;
+  }
+};
+
+
+extern "C" {
+
+
+static Dinic *G;
+static int ans;
+
+// Builds the flow network: vertices 0..N-1 and N..2N-1 form two groups that
+// are fully connected with random capacities below 10; the source 2N feeds
+// the first group and the second group drains into the sink 2N+1, both with
+// random capacities below 1000. The generator is seeded with 1 and the order
+// of bench_rand() calls is fixed.
+void bench_dinic_prepare() {
+  N = setting->size;
+  bench_srand(1);
+
+  const int source = 2 * N;
+  const int sink = 2 * N + 1;
+
+  G = (Dinic*)bench_alloc(sizeof(Dinic));
+  G->init(2 * N + 2);
+
+  for (int left = 0; left < N; left ++) {
+    for (int right = 0; right < N; right ++) {
+      int cap = bench_rand() % 10;
+      G->AddEdge(left, N + right, cap);
+    }
+  }
+
+  for (int k = 0; k < N; k ++) {
+    int in_cap = bench_rand() % 1000;
+    G->AddEdge(source, k, in_cap);
+    int out_cap = bench_rand() % 1000;
+    G->AddEdge(N + k, sink, out_cap);
+  }
+}
+
+// Timed part: computes the maximum flow from vertex 2N to vertex 2N+1.
+void bench_dinic_run() {
+  const int source = 2 * N;
+  const int sink = 2 * N + 1;
+  ans = G->Maxflow(source, sink);
+}
+
+// Passes when the computed flow equals the expected value of the setting.
+int bench_dinic_validate() {
+  const uint32_t got = (uint32_t)ans;
+  return got == setting->checksum;
+}
+}
+
+
--- a/am-kernels/benchmarks/microbench/src/fib/fib.c
+++ b/am-kernels/benchmarks/microbench/src/fib/fib.c
@ -0,0 +1,64 @@
+#include <benchmark.h>
+
+// f(n) = (f(n-1) + f(n-2) + .. f(n-m)) mod 2^32
+
+#define N 2147483603
+static int M;
+
+// Stores data at row i, column j of the row-major M x M matrix m.
+static void put(uint32_t *m, int i, int j, uint32_t data) {
+  uint32_t *cell = &m[i * M + j];
+  *cell = data;
+}
+
+// Loads the element at row i, column j of the row-major M x M matrix m.
+static uint32_t get(uint32_t *m, int i, int j) {
+  const uint32_t *cell = &m[i * M + j];
+  return *cell;
+}
+
+// c = a * b for M x M matrices, with wrap-around uint32_t arithmetic.
+// Every partial sum is written back to c through put()/get().
+static inline void mult(uint32_t *c, uint32_t *a, uint32_t *b) {
+  for (int row = 0; row < M; row ++) {
+    for (int col = 0; col < M; col ++) {
+      put(c, row, col, 0);
+      int k = 0;
+      while (k < M) {
+        put(c, row, col, get(c, row, col) + get(a, row, k) * get(b, k, col));
+        k ++;
+      }
+    }
+  }
+}
+
+// Copies the M x M matrix b into a, element by element.
+static inline void assign(uint32_t *a, uint32_t *b) {
+  for (int row = 0; row < M; row ++) {
+    for (int col = 0; col < M; col ++) {
+      uint32_t value = get(b, row, col);
+      put(a, row, col, value);
+    }
+  }
+}
+
+static uint32_t *A, *ans, *T, *tmp;
+
+// Reads the matrix order from the setting and allocates the four M x M
+// work matrices in the order A, T, ans, tmp.
+void bench_fib_prepare() {
+  M = setting->size;
+  int sz = sizeof(uint32_t) * M * M;
+
+  uint32_t **slots[] = { &A, &T, &ans, &tmp };
+  for (int k = 0; k < 4; k ++) {
+    *slots[k] = bench_alloc(sz);
+  }
+}
+
+// Timed part: raises the transition matrix to the power N by repeated
+// squaring. A and T start as the matrix with ones on the super-diagonal and
+// in the last row; ans starts as the identity and accumulates the result.
+void bench_fib_run() {
+  for (int row = 0; row < M; row ++) {
+    for (int col = 0; col < M; col ++) {
+      uint32_t x = (row == M - 1 || col == row + 1);
+      put(A, row, col, x);
+      put(T, row, col, x);
+      put(ans, row, col, row == col);
+    }
+  }
+
+  int n = N;
+  while (n > 0) {
+    if (n & 1) {
+      // ans = ans * T
+      mult(tmp, ans, T);
+      assign(ans, tmp);
+    }
+    // T = T * T
+    mult(tmp, T, T);
+    assign(T, tmp);
+    n >>= 1;
+  }
+}
+
+// Passes when the bottom-right element of ans equals the expected value.
+int bench_fib_validate() {
+  uint32_t corner = get(ans, M-1, M-1);
+  return corner == setting->checksum;
+}
--- a/am-kernels/benchmarks/microbench/src/lzip/lzip.c
+++ b/am-kernels/benchmarks/microbench/src/lzip/lzip.c
@ -0,0 +1,29 @@
+#include "quicklz.h"
+#include <benchmark.h>
+
+static int SIZE;
+
+static qlz_state_compress *state;
+static char *blk;
+static char *compress;
+static int len;
+
+// Allocates the compressor state, a SIZE-byte input block and an output
+// buffer with 400 bytes of slack, then fills the input with pseudo-random
+// lowercase letters (seed 1).
+void bench_lzip_prepare() {
+  SIZE = setting->size;
+  bench_srand(1);
+
+  // Allocation order is kept: it determines the buffer addresses.
+  state = bench_alloc(sizeof(qlz_state_compress));
+  blk = bench_alloc(SIZE);
+  compress = bench_alloc(SIZE + 400);
+
+  int pos = 0;
+  while (pos < SIZE) {
+    blk[pos] = 'a' + bench_rand() % 26;
+    pos ++;
+  }
+}
+
+// Timed part: compresses blk into compress and records the output length.
+void bench_lzip_run() {
+  size_t produced = qlz_compress(blk, compress, SIZE, state);
+  len = produced;
+}
+
+// Passes when the checksum of the compressed output matches the setting.
+int bench_lzip_validate() {
+  uint32_t actual = checksum(compress, compress + len);
+  return actual == setting->checksum;
+}
+
--- a/am-kernels/benchmarks/microbench/src/lzip/quicklz.c
+++ b/am-kernels/benchmarks/microbench/src/lzip/quicklz.c
@ -0,0 +1,761 @@
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// 1.5.0 final
+
+#include "quicklz.h"
+
+#if QLZ_VERSION_MAJOR != 1 || QLZ_VERSION_MINOR != 5 || QLZ_VERSION_REVISION != 0
+	#error quicklz.c and quicklz.h have different versions
+#endif
+
+#define MINOFFSET 2
+#define UNCONDITIONAL_MATCHLEN 6
+#define UNCOMPRESSED_END 4
+#define CWORD_LEN 4
+
+#if QLZ_COMPRESSION_LEVEL == 1 && defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+	#define OFFSET_BASE source
+	#define CAST (ui32)(size_t)
+#else
+	#define OFFSET_BASE 0
+	#define CAST
+#endif
+
+// Reports a compile-time property of this QuickLZ build, selected by
+// `setting`:
+//   0 compression level      1 sizeof(qlz_state_compress)
+//   2 sizeof(qlz_state_decompress)   3 streaming buffer size
+//   6 1 if QLZ_MEMORY_SAFE is defined, else 0
+//   7 / 8 / 9 version major / minor / revision
+// Any other selector yields -1.
+int qlz_get_setting(int setting)
+{
+#ifdef QLZ_MEMORY_SAFE
+	const int memory_safe = 1;
+#else
+	const int memory_safe = 0;
+#endif
+
+	if (setting == 0) return QLZ_COMPRESSION_LEVEL;
+	if (setting == 1) return sizeof(qlz_state_compress);
+	if (setting == 2) return sizeof(qlz_state_decompress);
+	if (setting == 3) return QLZ_STREAMING_BUFFER;
+	if (setting == 6) return memory_safe;
+	if (setting == 7) return QLZ_VERSION_MAJOR;
+	if (setting == 8) return QLZ_VERSION_MINOR;
+	if (setting == 9) return QLZ_VERSION_REVISION;
+	return -1;
+}
+
+#if QLZ_COMPRESSION_LEVEL == 1
+// Returns 1 when src[1] .. src[n] all equal src[0] (trivially for n == 0),
+// otherwise 0. The comparison runs from src[n] downwards.
+static int same(const unsigned char *src, size_t n)
+{
+	size_t k;
+	for (k = n; k > 0; k--)
+	{
+		if (src[k] != src[0])
+			return 0;
+	}
+	return 1;
+}
+#endif
+
+// Empties the compressor's hash table: at level 1 every slot's offset is
+// cleared, at the other levels every slot's entry counter.
+static void reset_table_compress(qlz_state_compress *state)
+{
+	int slot = 0;
+	while (slot < QLZ_HASH_VALUES)
+	{
+#if QLZ_COMPRESSION_LEVEL == 1
+		state->hash[slot].offset = 0;
+#else
+		state->hash_counter[slot] = 0;
+#endif
+		slot++;
+	}
+}
+
+// Clears the decompressor's per-slot entry counters. Only level 2 has
+// anything to reset; at the other levels this does nothing.
+static void reset_table_decompress(qlz_state_decompress *state)
+{
+#if QLZ_COMPRESSION_LEVEL == 2
+	int slot = 0;
+	while (slot < QLZ_HASH_VALUES)
+	{
+		state->hash_counter[slot] = 0;
+		slot++;
+	}
+#else
+	(void)state;
+#endif
+}
+
+// Folds a fetched value into a hash-table index in [0, QLZ_HASH_VALUES).
+// Level 2 mixes in two shifted copies of the input, the other levels one.
+static __inline ui32 hash_func(ui32 i)
+{
+#if QLZ_COMPRESSION_LEVEL == 2
+	const ui32 mixed = (i >> 9) ^ (i >> 13) ^ i;
+#else
+	const ui32 mixed = (i >> 12) ^ i;
+#endif
+	return mixed & (QLZ_HASH_VALUES - 1);
+}
+
+// Reads `bytes` (1..4) bytes from src one at a time and assembles them as a
+// little-endian unsigned value, so src needs no particular alignment.
+// Returns 0 when `bytes` is outside 1..4.
+static __inline ui32 fast_read(void const *src, ui32 bytes)
+{
+	const unsigned char *p = (const unsigned char *)src;
+	ui32 ret = 0;
+	ui32 i;
+
+	if (bytes >= 1 && bytes <= 4)
+	{
+		for (i = 0; i < bytes; i++)
+		{
+			// Widen to ui32 before shifting: an unsigned char is promoted to
+			// (signed) int, and shifting a byte >= 0x80 left by 24 would
+			// overflow the sign bit, which is undefined behaviour.
+			ret |= (ui32)p[i] << (i * 8);
+		}
+	}
+	return ret;
+}
+
+// Hash-table index for the three bytes starting at src.
+static __inline ui32 hashat(const unsigned char *src)
+{
+	return hash_func(fast_read(src, 3));
+}
+
+// Stores the low `bytes` bytes of f at dst, least significant byte first,
+// one byte at a time so dst needs no particular alignment.
+//
+// The bytes are produced by shifting rather than by copying the in-memory
+// representation of f, so the output is little-endian on every host and
+// matches what fast_read() expects. `bytes` must be at most 4; every call
+// in this file passes 2, 3 or 4.
+static __inline void fast_write(ui32 f, void *dst, size_t bytes)
+{
+	unsigned char *out = (unsigned char *)dst;
+	size_t i;
+
+	for (i = 0; i != bytes; i++)
+	{
+		out[i] = (unsigned char)(f >> (8 * i));
+	}
+}
+
+
+// Extracts the decompressed size from a packet header. Bit 1 of the first
+// byte selects 4-byte (set) or 1-byte (clear) size fields; the decompressed
+// size is the second field, located after the compressed size.
+size_t qlz_size_decompressed(const char *source)
+{
+	const ui32 width = ((*source) & 2) ? 4 : 1;
+	const ui32 mask = 0xffffffff >> ((4 - width)*8);
+	return fast_read(source + 1 + width, width) & mask;
+}
+
+// Extracts the compressed size from a packet header. Bit 1 of the first
+// byte selects 4-byte (set) or 1-byte (clear) size fields; the compressed
+// size is the first field, directly after the flag byte.
+size_t qlz_size_compressed(const char *source)
+{
+	const ui32 width = ((*source) & 2) ? 4 : 1;
+	const ui32 mask = 0xffffffff >> ((4 - width)*8);
+	return fast_read(source + 1, width) & mask;
+}
+
+// Header length in bytes: one flag byte plus two size fields, which are
+// 4 bytes each when bit 1 of the flag byte is set and 1 byte each otherwise.
+size_t qlz_size_header(const char *source)
+{
+	if ((*source) & 2)
+		return 9;
+	return 3;
+}
+
+
+// Stub: always fails its assertion instead of copying. qlz_decompress_core()
+// calls this for every match and on its fast literal path, so those paths
+// cannot complete in this build.
+static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n)
+{
+  assert(0); // unaligned memory access
+}
+
+// Records position s in the decompressor's hash table under the hash of the
+// three bytes at s. Does nothing at compression level 3.
+static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s)
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	// Level 1: one position per slot; the counter only marks the slot as used.
+	ui32 hash;
+	hash = hashat(s);
+	state->hash[hash].offset = s;
+	state->hash_counter[hash] = 1;
+#elif QLZ_COMPRESSION_LEVEL == 2
+	// Level 2: each slot is a ring of QLZ_POINTERS positions; the counter
+	// picks the entry to overwrite and is then advanced.
+	ui32 hash;
+	unsigned char c;
+	hash = hashat(s);
+	c = state->hash_counter[hash];
+	state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = s;
+	c++;
+	state->hash_counter[hash] = c;
+#endif
+	// Silence unused-parameter warnings when neither branch is compiled.
+	(void)state;
+	(void)s;
+}
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+// Advances *lh one byte at a time up to max, hashing every position it
+// steps onto. Leaves *lh unchanged when it is already at or beyond max.
+static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max)
+{
+	for (; *lh < max; )
+	{
+		*lh = *lh + 1;
+		update_hash(state, *lh);
+	}
+}
+#endif
+
+// Compresses `size` bytes from source into destination and returns the
+// number of bytes written (at least 9), or 0 when the data is judged not
+// compressible enough; the caller then stores it uncompressed.
+//
+// Output layout: 4-byte control words (CWORD_LEN) interleaved with items.
+// cword_val collects one flag per item, shifted in from the top bit
+// (1 = match reference, 0 = literal byte). It starts as 1U << 31; once that
+// marker has been shifted down to bit 0 the control word is full, is written
+// to cword_ptr, and a new 4-byte slot is reserved at dst.
+static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state)
+{
+	const unsigned char *last_byte = source + size - 1;
+	const unsigned char *src = source;
+	unsigned char *cword_ptr = destination;
+	unsigned char *dst = destination + CWORD_LEN;
+	ui32 cword_val = 1U << 31;
+	// Matches are only searched up to this position; the tail is emitted as
+	// literals by the second loop below.
+	const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
+	ui32 fetch = 0;
+	unsigned int lits = 0;
+
+	(void) lits;
+
+	if(src <= last_matchstart)
+		fetch = fast_read(src, 3);
+
+	while(src <= last_matchstart)
+	{
+		if ((cword_val & 1) == 1)
+		{
+			// store uncompressed if compression ratio is too low
+			// (checked once more than half of the input has been consumed)
+			if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5))
+				return 0;
+
+			fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+
+			cword_ptr = dst;
+			dst += CWORD_LEN;
+			cword_val = 1U << 31;
+			fetch = fast_read(src, 3);
+		}
+#if QLZ_COMPRESSION_LEVEL == 1
+		// Level 1: one candidate per hash slot, validated through the cached
+		// 3-byte fetch value stored alongside it.
+		{
+			const unsigned char *o;
+			ui32 hash, cached;
+
+			hash = hash_func(fetch);
+			cached = fetch ^ state->hash[hash].cache;
+			state->hash[hash].cache = fetch;
+
+			o = state->hash[hash].offset + OFFSET_BASE;
+			state->hash[hash].offset = CAST(src - OFFSET_BASE);
+
+			if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
+			{
+				if (*(o + 3) != *(src + 3))
+				{
+					// Exactly three bytes match: 2-byte item.
+					hash <<= 4;
+					cword_val = (cword_val >> 1) | (1U << 31);
+					fast_write((3 - 2) | hash, dst, 2);
+					src += 3;
+					dst += 2;
+				}
+				else
+				{
+					const unsigned char *old_src = src;
+					size_t matchlen;
+					hash <<= 4;
+
+					cword_val = (cword_val >> 1) | (1U << 31);
+					src += 4;
+
+					if(*(o + (src - old_src)) == *src)
+					{
+						src++;
+						if(*(o + (src - old_src)) == *src)
+						{
+							size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1;
+							size_t remaining = q > 255 ? 255 : q;
+							src++;
+							while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining)
+								src++;
+						}
+					}
+
+					matchlen = src - old_src;
+					// Short matches fit the length into the low 4 bits (2-byte
+					// item); longer ones carry it in a third byte.
+					if (matchlen < 18)
+					{
+						fast_write((ui32)(matchlen - 2) | hash, dst, 2);
+						dst += 2;
+					}
+					else
+					{
+						fast_write((ui32)(matchlen << 16) | hash, dst, 3);
+						dst += 3;
+					}
+				}
+				fetch = fast_read(src, 3);
+				lits = 0;
+			}
+			else
+			{
+				// Literal byte; slide the 3-byte fetch window by one.
+				lits++;
+				*dst = *src;
+				src++;
+				dst++;
+				cword_val = (cword_val >> 1);
+				fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16);
+			}
+		}
+#elif QLZ_COMPRESSION_LEVEL >= 2
+		// Levels 2 and 3: each hash slot holds up to QLZ_POINTERS candidate
+		// positions; the longest match among them is chosen.
+		{
+			const unsigned char *o, *offset2;
+			ui32 hash, matchlen, k, m, best_k = 0;
+			unsigned char c;
+			size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1);
+			(void)best_k;
+
+
+			//hash = hashat(src);
+			fetch = fast_read(src, 3);
+			hash = hash_func(fetch);
+
+			c = state->hash_counter[hash];
+
+			// Candidate 0 sets the initial best match (0 when unusable).
+			offset2 = state->hash[hash].offset[0];
+			if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0)
+			{
+				matchlen = 3;
+				if(*(offset2 + matchlen) == *(src + matchlen))
+				{
+					matchlen = 4;
+					while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining)
+						matchlen++;
+				}
+			}
+			else
+				matchlen = 0;
+			// Remaining candidates: keep whichever gives a longer match.
+			for(k = 1; k < QLZ_POINTERS && c > k; k++)
+			{
+				o = state->hash[hash].offset[k];
+#if QLZ_COMPRESSION_LEVEL == 3
+				if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
+#elif QLZ_COMPRESSION_LEVEL == 2
+				if(*(src + matchlen) == *(o + matchlen)	&& ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
+#endif
+				{
+					m = 3;
+					while(*(o + m) == *(src + m) && m < remaining)
+						m++;
+#if QLZ_COMPRESSION_LEVEL == 3
+					if ((m > matchlen) || (m == matchlen && o > offset2))
+#elif QLZ_COMPRESSION_LEVEL == 2
+					if (m > matchlen)
+#endif
+					{
+						offset2 = o;
+						matchlen = m;
+						best_k = k;
+					}
+				}
+			}
+			o = offset2;
+			// Record the current position in the slot's ring of candidates.
+			state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
+			c++;
+			state->hash_counter[hash] = c;
+
+#if QLZ_COMPRESSION_LEVEL == 3
+			if(matchlen > 2 && src - o < 131071)
+			{
+				ui32 u;
+				size_t offset = src - o;
+
+				// Also hash every position covered by the match.
+				for(u = 1; u < matchlen; u++)
+				{
+					hash = hashat(src + u);
+					c = state->hash_counter[hash]++;
+					state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u;
+				}
+
+				cword_val = (cword_val >> 1) | (1U << 31);
+				src += matchlen;
+
+				// Level 3 items encode the distance directly; the item size
+				// (1 to 4 bytes) depends on match length and distance.
+				if(matchlen == 3 && offset <= 63)
+				{
+					*dst = (unsigned char)(offset << 2);
+					dst++;
+				}
+				else if (matchlen == 3 && offset <= 16383)
+				{
+					ui32 f = (ui32)((offset << 2) | 1);
+					fast_write(f, dst, 2);
+					dst += 2;
+				}
+				else if (matchlen <= 18 && offset <= 1023)
+				{
+					ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2;
+					fast_write(f, dst, 2);
+					dst += 2;
+				}
+
+				else if(matchlen <= 33)
+				{
+					ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3;
+					fast_write(f, dst, 3);
+					dst += 3;
+				}
+				else
+				{
+					ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3;
+					fast_write(f, dst, 4);
+					dst += 4;
+				}
+			}
+			else
+			{
+				*dst = *src;
+				src++;
+				dst++;
+				cword_val = (cword_val >> 1);
+			}
+#elif QLZ_COMPRESSION_LEVEL == 2
+
+			if(matchlen > 2)
+			{
+				cword_val = (cword_val >> 1) | (1U << 31);
+				src += matchlen;
+
+				// Level 2 items encode the hash slot and the candidate index
+				// (best_k) instead of a distance.
+				if (matchlen < 10)
+				{
+					ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5);
+					fast_write(f, dst, 2);
+					dst += 2;
+				}
+				else
+				{
+					ui32 f = best_k | (matchlen << 16) | (hash << 5);
+					fast_write(f, dst, 3);
+					dst += 3;
+				}
+			}
+			else
+			{
+				*dst = *src;
+				src++;
+				dst++;
+				cword_val = (cword_val >> 1);
+			}
+#endif
+		}
+#endif
+	}
+	// Tail: everything after last_matchstart is emitted as literals. Levels 1
+	// and 2 still enter the positions into the hash table.
+	while (src <= last_byte)
+	{
+		if ((cword_val & 1) == 1)
+		{
+			fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+			cword_ptr = dst;
+			dst += CWORD_LEN;
+			cword_val = 1U << 31;
+		}
+#if QLZ_COMPRESSION_LEVEL < 3
+		if (src <= last_byte - 3)
+		{
+#if QLZ_COMPRESSION_LEVEL == 1
+			ui32 hash, fetch;
+			fetch = fast_read(src, 3);
+			hash = hash_func(fetch);
+			state->hash[hash].offset = CAST(src - OFFSET_BASE);
+			state->hash[hash].cache = fetch;
+#elif QLZ_COMPRESSION_LEVEL == 2
+			ui32 hash;
+			unsigned char c;
+			hash = hashat(src);
+			c = state->hash_counter[hash];
+			state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
+			c++;
+			state->hash_counter[hash] = c;
+#endif
+		}
+#endif
+		*dst = *src;
+		src++;
+		dst++;
+		cword_val = (cword_val >> 1);
+	}
+
+	// Shift the partially filled control word down until its marker bit
+	// reaches bit 0, then store it.
+	while((cword_val & 1) != 1)
+		cword_val = (cword_val >> 1);
+
+	fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
+
+	// min. size must be 9 bytes so that the qlz_size functions can take 9 bytes as argument
+	return dst - destination < 9 ? 9 : dst - destination;
+}
+
+// Decodes the compressed payload that follows the packet header at source
+// into destination, which must hold `size` bytes. Returns `size`, or 0 when
+// QLZ_MEMORY_SAFE is defined and the input fails a bounds check.
+//
+// `history` is the lowest address a match may refer back to; it is only
+// consulted under QLZ_MEMORY_SAFE.
+//
+// Note: matches and the fast literal path copy through memcpy_up(), which in
+// this file is a stub that always asserts.
+static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history)
+{
+	const unsigned char *src = source + qlz_size_header((const char *)source);
+	unsigned char *dst = destination;
+	const unsigned char *last_destination_byte = destination + size - 1;
+	// A value of 1 means the control word is used up and a new one is due.
+	ui32 cword_val = 1;
+	const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
+	unsigned char *last_hashed = destination - 1;
+	const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1;
+	// Indexed by the low 4 bits of the control word: number of trailing zero
+	// bits, i.e. consecutive literals that follow (4 when all four are zero).
+	static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+
+	(void) last_source_byte;
+	(void) last_hashed;
+	(void) state;
+	(void) history;
+
+	for(;;)
+	{
+		ui32 fetch;
+
+		if (cword_val == 1)
+		{
+#ifdef QLZ_MEMORY_SAFE
+			if(src + CWORD_LEN - 1 > last_source_byte)
+				return 0;
+#endif
+			cword_val = fast_read(src, CWORD_LEN);
+			src += CWORD_LEN;
+		}
+
+#ifdef QLZ_MEMORY_SAFE
+			if(src + 4 - 1 > last_source_byte)
+				return 0;
+#endif
+
+		fetch = fast_read(src, 4);
+
+		if ((cword_val & 1) == 1)
+		{
+			// Flag bit 1: the next item is a match reference.
+			ui32 matchlen;
+			const unsigned char *offset2;
+
+#if QLZ_COMPRESSION_LEVEL == 1
+			// Item: 4-bit length code + 12-bit hash slot; a zero length code
+			// means the length is stored in a third byte.
+			ui32 hash;
+			cword_val = cword_val >> 1;
+			hash = (fetch >> 4) & 0xfff;
+			offset2 = (const unsigned char *)(size_t)state->hash[hash].offset;
+
+			if((fetch & 0xf) != 0)
+			{
+				matchlen = (fetch & 0xf) + 2;
+				src += 2;
+			}
+			else
+			{
+				matchlen = *(src + 2);
+				src += 3;
+			}
+
+#elif QLZ_COMPRESSION_LEVEL == 2
+			// Item: 2-bit candidate index, 3-bit length code, 11-bit hash
+			// slot; a zero length code means the length is in a third byte.
+			ui32 hash;
+			unsigned char c;
+			cword_val = cword_val >> 1;
+			hash = (fetch >> 5) & 0x7ff;
+			c = (unsigned char)(fetch & 0x3);
+			offset2 = state->hash[hash].offset[c];
+
+			if((fetch & (28)) != 0)
+			{
+				matchlen = ((fetch >> 2) & 0x7) + 2;
+				src += 2;
+			}
+			else
+			{
+				matchlen = *(src + 2);
+				src += 3;
+			}
+
+#elif QLZ_COMPRESSION_LEVEL == 3
+			// The low bits of the item select one of five encodings of
+			// (distance, length), 1 to 4 bytes long.
+			ui32 offset;
+			cword_val = cword_val >> 1;
+			if ((fetch & 3) == 0)
+			{
+				offset = (fetch & 0xff) >> 2;
+				matchlen = 3;
+				src++;
+			}
+			else if ((fetch & 2) == 0)
+			{
+				offset = (fetch & 0xffff) >> 2;
+				matchlen = 3;
+				src += 2;
+			}
+			else if ((fetch & 1) == 0)
+			{
+				offset = (fetch & 0xffff) >> 6;
+				matchlen = ((fetch >> 2) & 15) + 3;
+				src += 2;
+			}
+			else if ((fetch & 127) != 3)
+			{
+				offset = (fetch >> 7) & 0x1ffff;
+				matchlen = ((fetch >> 2) & 0x1f) + 2;
+				src += 3;
+			}
+			else
+			{
+				offset = (fetch >> 15);
+				matchlen = ((fetch >> 7) & 255) + 3;
+				src += 4;
+			}
+
+			offset2 = dst - offset;
+#endif
+
+#ifdef QLZ_MEMORY_SAFE
+			if(offset2 < history || offset2 > dst - MINOFFSET - 1)
+				return 0;
+
+			if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1))
+				return 0;
+#endif
+
+			memcpy_up(dst, offset2, matchlen);
+			dst += matchlen;
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+			update_hash_upto(state, &last_hashed, dst - matchlen);
+			last_hashed = dst - 1;
+#endif
+		}
+		else
+		{
+			// Flag bit 0: literal bytes.
+			if (dst < last_matchstart)
+			{
+				// Fast path: copy four bytes speculatively, then advance by
+				// the number of literals the control word actually announces.
+				unsigned int n = bitlut[cword_val & 0xf];
+				memcpy_up(dst, src, 4);
+				cword_val = cword_val >> n;
+				dst += n;
+				src += n;
+#if QLZ_COMPRESSION_LEVEL <= 2
+				update_hash_upto(state, &last_hashed, dst - 3);
+#endif
+			}
+			else
+			{
+				// Tail: the remaining output is literals only; control words
+				// met on the way are skipped without being read.
+				while(dst <= last_destination_byte)
+				{
+					if (cword_val == 1)
+					{
+						src += CWORD_LEN;
+						cword_val = 1U << 31;
+					}
+#ifdef QLZ_MEMORY_SAFE
+					if(src >= last_source_byte + 1)
+						return 0;
+#endif
+					*dst = *src;
+					dst++;
+					src++;
+					cword_val = cword_val >> 1;
+				}
+
+#if QLZ_COMPRESSION_LEVEL <= 2
+				update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant
+#endif
+				return size;
+			}
+
+		}
+	}
+}
+
+// Compresses `size` bytes from source into destination as one packet
+// (header + payload) and returns the packet length, or 0 when size is 0 or
+// larger than 0xffffffff - 400.
+//
+// When qlz_compress_core() reports the data as not compressible (returns 0),
+// the payload is a plain copy of the input and the "compressed" flag in the
+// header stays clear.
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state)
+{
+	size_t r;
+	ui32 compressed;
+	size_t base;
+
+	if(size == 0 || size > 0xffffffff - 400)
+		return 0;
+
+	// Header length: 3 bytes (1-byte size fields) for small inputs,
+	// 9 bytes (4-byte size fields) otherwise.
+	if(size < 216)
+		base = 3;
+	else
+		base = 9;
+
+#if QLZ_STREAMING_BUFFER > 0
+	if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER)
+#endif
+	{
+		// Non-streaming path (also taken when the streaming buffer cannot
+		// hold this input): compress with a freshly reset hash table.
+		reset_table_compress(state);
+		r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state);
+#if QLZ_STREAMING_BUFFER > 0
+		reset_table_compress(state);
+#endif
+		if(r == base)
+		{
+			// Core returned 0: store the input verbatim.
+			bench_memcpy(destination + base, source, size);
+			r = size + base;
+			compressed = 0;
+		}
+		else
+		{
+			compressed = 1;
+		}
+		state->stream_counter = 0;
+	}
+#if QLZ_STREAMING_BUFFER > 0
+	else
+	{
+		// Streaming path: append the input to the stream buffer and compress
+		// it from there, keeping the hash table from earlier packets.
+		unsigned char *src = state->stream_buffer + state->stream_counter;
+
+		bench_memcpy(src, source, size);
+		r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state);
+
+ 		if(r == base)
+		{
+			bench_memcpy(destination + base, src, size);
+			r = size + base;
+			compressed = 0;
+			reset_table_compress(state);
+		}
+		else
+		{
+			compressed = 1;
+		}
+		state->stream_counter += size;
+	}
+#endif
+	// Header: flag byte, then packet length and original length, each 1 byte
+	// (short header) or 4 bytes (long header).
+	if(base == 3)
+	{
+		*destination = (unsigned char)(0 | compressed);
+		*(destination + 1) = (unsigned char)r;
+		*(destination + 2) = (unsigned char)size;
+	}
+	else
+	{
+		*destination = (unsigned char)(2 | compressed);
+		fast_write((ui32)r, destination + 1, 4);
+		fast_write((ui32)size, destination + 5, 4);
+	}
+
+	*destination |= (QLZ_COMPRESSION_LEVEL << 2);
+	*destination |= (1 << 6);
+	*destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 1 : (QLZ_STREAMING_BUFFER == 1000000 ? 2 : 3))) << 4);
+
+// Flag byte layout, bit 7 down to bit 0:
+// 76543210
+// 01SSLLHC
+// C = payload is compressed, H = long (9-byte) header, LL = compression
+// level, SS = streaming buffer code, bit 6 always set.
+
+	return r;
+}
+
+// Decompresses the packet at source into destination and returns the number
+// of bytes produced. Bit 0 of the first header byte tells whether the
+// payload is compressed or stored verbatim.
+//
+// Note: compressed payloads go through qlz_decompress_core(), whose copy
+// helper memcpy_up() is a stub that asserts in this file.
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state)
+{
+	size_t dsiz = qlz_size_decompressed(source);
+
+#if QLZ_STREAMING_BUFFER > 0
+	if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER)
+#endif
+	{
+		// Non-streaming path (also taken when the streaming buffer cannot
+		// hold this packet's output): decode straight into destination.
+		if((*source & 1) == 1)
+		{
+			reset_table_decompress(state);
+			dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination);
+		}
+		else
+		{
+			// Stored payload: plain copy of the bytes after the header.
+			bench_memcpy(destination, source + qlz_size_header(source), dsiz);
+		}
+		state->stream_counter = 0;
+		reset_table_decompress(state);
+	}
+#if QLZ_STREAMING_BUFFER > 0
+	else
+	{
+		// Streaming path: decode into the stream buffer so later packets can
+		// refer back to this data, then copy the result out.
+		unsigned char *dst = state->stream_buffer + state->stream_counter;
+		if((*source & 1) == 1)
+		{
+			dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer);
+		}
+		else
+		{
+			bench_memcpy(dst, source + qlz_size_header(source), dsiz);
+			reset_table_decompress(state);
+		}
+		bench_memcpy(destination, dst, dsiz);
+		state->stream_counter += dsiz;
+	}
+#endif
+	return dsiz;
+}
+
--- a/am-kernels/benchmarks/microbench/src/lzip/quicklz.h
+++ b/am-kernels/benchmarks/microbench/src/lzip/quicklz.h
@ -0,0 +1,164 @@
+#ifndef QLZ_HEADER
+#define QLZ_HEADER
+
+#include <am.h>
+#include <klib.h>
+
+// Copy n bytes from src to dst and return dst. The two regions may overlap:
+// when dst starts inside [src, src+n) the copy runs from the last byte towards
+// the first so that source bytes are read before they are overwritten.
+// Both pointers must be non-NULL (checked with assert).
+static inline void* bench_memcpy(void* dst, const void* src, size_t n){
+  assert(dst&&src);
+  // Use byte pointers throughout: arithmetic and ordering on void* is a GNU
+  // extension and is rejected by ISO C and by C++ compilers.
+  const char* s=(const char*)src;
+  char* d=(char*)dst;
+  if(s<d&&s+n>d){
+    // dst overlaps the tail of src: copy backwards.
+    s+=n;
+    d+=n;
+    while(n-->0)*--d=*--s;
+  }
+  else{
+    while(n-->0)*d++=*s++;
+  }
+  return dst;
+}
+
+
+// Fast data compression library
+// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
+// lar@quicklz.com
+//
+// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
+// released into public must be open source) or under a commercial license if such
+// has been acquired (see http://www.quicklz.com/order.html). The commercial license
+// does not cover derived or ported versions created by third parties under GPL.
+
+// You can edit following user settings. Data must be decompressed with the same
+// setting of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was compressed
+// (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers must be initially
+// zeroed out (see manual). First #ifndef makes it possible to define settings from
+// the outside like the compiler command line.
+
+// 1.5.0 final
+
+// Each setting has its own guard so that any one of them can be overridden from
+// the compiler command line without leaving the others undefined.
+
+// Compression level: 1 gives fastest compression speed. 3 gives fastest
+// decompression speed and best compression ratio. Valid values are 1, 2 and 3.
+#ifndef QLZ_COMPRESSION_LEVEL
+	#define QLZ_COMPRESSION_LEVEL 2
+#endif
+
+// Streaming buffer size (0, 100000 or 1000000). If > 0, zero out both states
+// prior to first call to qlz_compress() or qlz_decompress() and decompress
+// packets in the same order as they were compressed.
+#ifndef QLZ_STREAMING_BUFFER
+	#define QLZ_STREAMING_BUFFER 0
+#endif
+
+// Guarantees that decompression of corrupted data cannot crash. Decreases decompression
+// speed 10-20%. Compression speed not affected. Disabled by default; uncomment
+// or define from the outside to enable.
+//#define QLZ_MEMORY_SAFE
+
+#define QLZ_VERSION_MAJOR 1
+#define QLZ_VERSION_MINOR 5
+#define QLZ_VERSION_REVISION 0
+
+// Verify compression level
+#if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3
+#error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3
+#endif
+
+typedef unsigned int ui32;
+typedef unsigned short int ui16;
+
+// Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values!
+#if QLZ_COMPRESSION_LEVEL == 1
+#define QLZ_POINTERS 1
+#define QLZ_HASH_VALUES 4096
+#elif QLZ_COMPRESSION_LEVEL == 2
+#define QLZ_POINTERS 4
+#define QLZ_HASH_VALUES 2048
+#elif QLZ_COMPRESSION_LEVEL == 3
+#define QLZ_POINTERS 16
+#define QLZ_HASH_VALUES 4096
+#endif
+
+// hash entry
+// Hash-table entry used while compressing. Layout depends on the level:
+//  - level 1: a 32-bit `cache` word plus a single `offset`, stored as an
+//    unsigned int when QLZ_PTR_64 is defined and streaming is disabled,
+//    otherwise as a pointer;
+//  - levels 2 and 3: QLZ_POINTERS candidate pointers.
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	ui32 cache;
+#if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
+	unsigned int offset;
+#else
+	const unsigned char *offset;
+#endif
+#else
+	const unsigned char *offset[QLZ_POINTERS];
+#endif
+
+} qlz_hash_compress;
+
+// Hash-table entry used while decompressing: one pointer at level 1,
+// QLZ_POINTERS pointers at the other levels.
+typedef struct
+{
+#if QLZ_COMPRESSION_LEVEL == 1
+	const unsigned char *offset;
+#else
+	const unsigned char *offset[QLZ_POINTERS];
+#endif
+} qlz_hash_decompress;
+
+
+// states
+// Compressor scratch state handed to qlz_compress().
+typedef struct
+{
+	// Present only in streaming builds (QLZ_STREAMING_BUFFER > 0).
+	#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+	#endif
+	// Running byte count; qlz_compress() advances it by the packet size.
+	size_t stream_counter;
+	// One entry and one counter byte per hash value.
+	qlz_hash_compress hash[QLZ_HASH_VALUES];
+	unsigned char hash_counter[QLZ_HASH_VALUES];
+} qlz_state_compress;
+
+
+// Decompressor scratch state handed to qlz_decompress(). Levels 1 and 2 carry
+// a hash table and per-entry counters; level 3 only keeps the stream position
+// (plus the streaming buffer when QLZ_STREAMING_BUFFER > 0).
+#if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2
+	typedef struct
+	{
+#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+		qlz_hash_decompress hash[QLZ_HASH_VALUES];
+		unsigned char hash_counter[QLZ_HASH_VALUES];
+		// Bytes currently held in the stream; reset or advanced by qlz_decompress().
+		size_t stream_counter;
+	} qlz_state_decompress;
+#elif QLZ_COMPRESSION_LEVEL == 3
+	typedef struct
+	{
+#if QLZ_STREAMING_BUFFER > 0
+		unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
+#endif
+		// NOTE(review): this condition can never hold inside the level-3 branch,
+		// so the hash member below is never compiled in.
+#if QLZ_COMPRESSION_LEVEL <= 2
+		qlz_hash_decompress hash[QLZ_HASH_VALUES];
+#endif
+		size_t stream_counter;
+	} qlz_state_decompress;
+#endif
+
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+// Public functions of QuickLZ
+// Size of the data a packet expands to; qlz_decompress() uses it as the output length.
+size_t qlz_size_decompressed(const char *source);
+// Presumably the total size of the packet at `source` — TODO confirm against quicklz.c.
+size_t qlz_size_compressed(const char *source);
+// Compresses `size` bytes from `source` into `destination` using `state` as scratch;
+// returns a byte count (presumably the packet size — confirm against quicklz.c).
+size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state);
+// Decompresses one packet from `source` into `destination`; returns the number of bytes produced.
+size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state);
+// Queries a build-time setting by index; the index meanings are defined in quicklz.c.
+int qlz_get_setting(int setting);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
+
--- a/am-kernels/benchmarks/microbench/src/md5/md5.c
+++ b/am-kernels/benchmarks/microbench/src/md5/md5.c
@ -0,0 +1,159 @@
+/*
+ * Simple MD5 implementation (github.com/pod32g/md5)
+ *
+ */
+
+#include <benchmark.h>
+
+static int N;
+
+// Constants are the integer part of the sines of integers (in radians) * 2^32.
+const uint32_t k[64] = {
+0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee ,
+0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 ,
+0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be ,
+0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 ,
+0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa ,
+0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 ,
+0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed ,
+0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a ,
+0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c ,
+0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 ,
+0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 ,
+0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 ,
+0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 ,
+0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 ,
+0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 ,
+0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
+
+// r specifies the per-round shift amounts
+static const uint32_t r[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
+                 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20, 5,  9, 14, 20,
+                 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
+                 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
+
+// leftrotate function definition
+#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
+
+// Store val into bytes[0..3], least significant byte first (little-endian).
+static void to_bytes(uint32_t val, uint8_t *bytes)
+{
+    for (int idx = 0; idx < 4; idx++)
+        bytes[idx] = (uint8_t) (val >> (8 * idx));
+}
+
+// Assemble a 32-bit word from bytes[0..3], where bytes[0] is the least
+// significant byte (little-endian).
+static uint32_t to_int32(const uint8_t *bytes)
+{
+    uint32_t word = 0;
+    for (int idx = 3; idx >= 0; idx--)
+        word = (word << 8) | bytes[idx];
+    return word;
+}
+
+// Compute the MD5 digest of msg[0 .. initial_len) and write the 16 result
+// bytes to `digest`.
+// The padding is built in place: the function writes to msg[initial_len] up to
+// msg[new_len + 7], where new_len is the first length greater than initial_len
+// that is 56 modulo 64. The caller's buffer must therefore provide up to 72
+// writable bytes beyond initial_len.
+static void md5(uint8_t *msg, size_t initial_len, uint8_t *digest) {
+
+    // These vars will contain the hash
+    uint32_t h0, h1, h2, h3;
+
+    size_t new_len, offset;
+    uint32_t w[16];
+    uint32_t a, b, c, d, i, f, g, temp;
+
+    // Initialize variables - simple count in nibbles:
+    h0 = 0x67452301;
+    h1 = 0xefcdab89;
+    h2 = 0x98badcfe;
+    h3 = 0x10325476;
+
+    //Pre-processing:
+    //append "1" bit to message
+    //append "0" bits until message length in bits ≡ 448 (mod 512)
+    //append length mod (2^64) to message
+
+    // Find the padded length (excluding the 8-byte length field): the smallest
+    // value above initial_len that is 56 modulo 64.
+    for (new_len = initial_len + 1; new_len % (512/8) != 448/8; new_len++)
+        ;
+
+    msg[initial_len] = 0x80; // append the "1" bit; most significant bit is "first"
+    for (offset = initial_len + 1; offset < new_len; offset++)
+        msg[offset] = 0; // append "0" bits
+
+    // append the len in bits at the end of the buffer.
+    to_bytes(initial_len*8, msg + new_len);
+    // initial_len>>29 == initial_len*8>>32, but avoids overflow.
+    to_bytes(initial_len>>29, msg + new_len + 4);
+
+    // Process the message in successive 512-bit chunks:
+    //for each 512-bit chunk of message:
+    // (new_len is 56 mod 64, so the last iteration covers the chunk that ends
+    // with the 8-byte length field.)
+    for(offset=0; offset<new_len; offset += (512/8)) {
+
+        // break chunk into sixteen 32-bit words w[j], 0 ≤ j ≤ 15
+        for (i = 0; i < 16; i++)
+            w[i] = to_int32(msg + offset + i*4);
+
+        // Initialize hash value for this chunk:
+        a = h0;
+        b = h1;
+        c = h2;
+        d = h3;
+
+        // Main loop:
+        for(i = 0; i<64; i++) {
+
+            // Each group of 16 steps uses its own mixing function f and its
+            // own order g in which the message words are consumed.
+            if (i < 16) {
+                f = (b & c) | ((~b) & d);
+                g = i;
+            } else if (i < 32) {
+                f = (d & b) | ((~d) & c);
+                g = (5*i + 1) % 16;
+            } else if (i < 48) {
+                f = b ^ c ^ d;
+                g = (3*i + 5) % 16;
+            } else {
+                f = c ^ (b | (~d));
+                g = (7*i) % 16;
+            }
+
+            // Rotate the working registers; only b receives a newly mixed value.
+            temp = d;
+            d = c;
+            c = b;
+            b = b + LEFTROTATE((a + f + k[i] + w[g]), r[i]);
+            a = temp;
+
+        }
+
+        // Add this chunk's hash to result so far:
+        h0 += a;
+        h1 += b;
+        h2 += c;
+        h3 += d;
+
+    }
+
+    //var char digest[16] := h0 append h1 append h2 append h3 //(Output is in little-endian)
+    to_bytes(h0, digest);
+    to_bytes(h1, digest + 4);
+    to_bytes(h2, digest + 8);
+    to_bytes(h3, digest + 12);
+}
+
+static uint8_t *str;
+static uint8_t *digest;
+
+// Set up the MD5 benchmark: read the input size from `setting`, seed the
+// generator, fill the message with N generated bytes and allocate the 16-byte
+// digest buffer.
+void bench_md5_prepare() {
+  N = setting->size;
+  bench_srand(1);
+  // md5() pads the message in place: after the N payload bytes it writes up to
+  // 64 bytes of padding followed by the 8-byte length field, so reserve room
+  // for them instead of letting the padding spill past the allocation.
+  str = bench_alloc(N + 64 + 8);
+  for (int i = 0; i < N; i ++) {
+    str[i] = bench_rand();
+  }
+  digest = bench_alloc(16);
+}
+
+// Timed section: hash the prepared N-byte message into `digest`.
+void bench_md5_run() {
+  md5(str, N, digest);
+}
+
+// Returns non-zero when the checksum of the 16 digest bytes equals the
+// expected value stored in `setting`.
+int bench_md5_validate() {
+  return checksum(digest, digest + 16) == setting->checksum;
+}
--- a/am-kernels/benchmarks/microbench/src/qsort/qsort.c
+++ b/am-kernels/benchmarks/microbench/src/qsort/qsort.c
@ -0,0 +1,44 @@
+#include <benchmark.h>
+
+static int N, *data;
+
+// Build the input array: N ints, each combining two consecutive generator
+// outputs (first one shifted into the upper half, second one in the lower).
+void bench_qsort_prepare() {
+  bench_srand(1);
+
+  N = setting->size;
+  data = bench_alloc(N * sizeof(int));
+
+  int *slot = data;
+  for (int remaining = N; remaining > 0; remaining --) {
+    int hi = bench_rand();
+    int lo = bench_rand();
+    *slot ++ = (hi << 16) | lo;
+  }
+}
+
+// Exchange the two ints that a and b point to.
+static void swap(int *a, int *b) {
+  const int first = *a, second = *b;
+  *a = second;
+  *b = first;
+}
+
+// Quicksort of the half-open range a[l .. r) in ascending order, using the
+// first element of the range as the pivot.
+static void myqsort(int *a, int l, int r) {
+  if (l >= r) return;
+
+  const int key = a[l];
+  int last = l;  // index of the last element known to be smaller than key
+  for (int scan = l + 1; scan < r; scan ++) {
+    if (a[scan] < key) {
+      last ++;
+      swap(&a[last], &a[scan]);
+    }
+  }
+  // Drop the pivot between the "smaller" and the "not smaller" parts.
+  swap(&a[last], &a[l]);
+
+  myqsort(a, l, last);
+  myqsort(a, last + 1, r);
+}
+
+// Timed section: sort the whole prepared array, i.e. the range data[0 .. N).
+void bench_qsort_run() {
+  myqsort(data, 0, N);
+}
+
+// Returns non-zero when the checksum over data[0 .. N) equals the expected
+// value stored in `setting`.
+int bench_qsort_validate() {
+  return checksum(data, data + N) == setting->checksum;
+}
--- a/am-kernels/benchmarks/microbench/src/queen/queen.c
+++ b/am-kernels/benchmarks/microbench/src/queen/queen.c
@ -0,0 +1,32 @@
+#include <benchmark.h>
+
+static unsigned int FULL;
+
+// Count completions of a partial placement described by three bitmasks:
+// `row` holds the columns already used, `ld` and `rd` the columns attacked
+// along the two diagonal directions for the line being filled. A placement is
+// complete once `row` equals FULL.
+static unsigned int dfs(unsigned int row, unsigned int ld, unsigned int rd) {
+  if (row == FULL) return 1;
+
+  unsigned int total = 0;
+  unsigned int avail = FULL & ~(row | ld | rd);
+  while (avail != 0) {
+    unsigned int bit = avail & (~avail + 1);  // isolate the lowest set bit
+    avail -= bit;
+    total += dfs(row | bit, (ld | bit) << 1, (rd | bit) >> 1);
+  }
+  return total;
+}
+
+static unsigned int ans;
+
+// Reset the result and build FULL, a mask with the low `setting->size` bits set.
+void bench_queen_prepare() {
+  ans = 0;
+  // Shift an unsigned one: FULL is unsigned, and a signed `1 << n` is
+  // undefined once the shift reaches the sign bit.
+  FULL = (1u << setting->size) - 1;
+}
+
+// Timed section: count all complete placements starting from an empty board.
+void bench_queen_run() {
+  ans = dfs(0, 0, 0);
+}
+
+// Returns non-zero when the counted result equals the expected value in `setting`.
+int bench_queen_validate() {
+  return ans == setting->checksum;
+}
--- a/am-kernels/benchmarks/microbench/src/sieve/sieve.c
+++ b/am-kernels/benchmarks/microbench/src/sieve/sieve.c
@ -0,0 +1,42 @@
+#include <benchmark.h>
+
+static int N;
+
+static int ans;
+static uint32_t *primes;
+
+// Read bit n of the `primes` bitmap (32 bits per word); returns 0 or 1.
+static inline int get(int n) {
+  uint32_t word = primes[n >> 5];
+  return (word >> (n & 31)) & 1;
+}
+
+// Clear bit n of the `primes` bitmap.
+static inline void clear(int n) {
+  uint32_t mask = (uint32_t)(1ul << (n & 31));
+  primes[n >> 5] &= ~mask;
+}
+
+// Allocate the bitmap (N/8 bytes plus 128 bytes of slack) and set every bit in
+// words 0 .. N/32, marking all numbers as prime candidates.
+void bench_sieve_prepare() {
+  N = setting->size;
+  primes = (uint32_t*)bench_alloc(N / 8 + 128);
+
+  const int last_word = N / 32;
+  int w = 0;
+  while (w <= last_word) {
+    primes[w] = 0xffffffff;
+    w ++;
+  }
+}
+
+// Timed section: sieve of Eratosthenes over 2 .. N, leaving the number of
+// surviving bits in `ans`. If any bit in 1 .. N is already cleared the function
+// returns immediately without touching `ans`.
+void bench_sieve_run() {
+  int probe = 1;
+  while (probe <= N) {
+    if (get(probe) == 0) return;
+    probe ++;
+  }
+
+  for (int p = 2; p * p <= N; p ++) {
+    if (!get(p)) continue;
+    // strike out every multiple of p, starting at 2p
+    for (int multiple = p + p; multiple <= N; multiple += p) {
+      clear(multiple);
+    }
+  }
+
+  int survivors = 0;
+  for (int v = 2; v <= N; v ++) {
+    survivors += get(v);  // get() yields 0 or 1
+  }
+  ans = survivors;
+}
+
+// Returns non-zero when the counted result equals the expected value in `setting`.
+int bench_sieve_validate() {
+  return ans == setting->checksum;
+}
--- a/am-kernels/benchmarks/microbench/src/ssort/ssort.cc
+++ b/am-kernels/benchmarks/microbench/src/ssort/ssort.cc
@ -0,0 +1,111 @@
+// This is the Skew algorithm's reference implementation.
+
+#include <benchmark.h>
+
+static int N;
+
+// Lexicographic "less than or equal" on pairs: (a1,a2) <= (b1,b2).
+inline bool leq(int a1, int a2,   int b1, int b2) {
+  if (a1 != b1) return a1 < b1;
+  return a2 <= b2;
+}
+// Lexicographic "less than or equal" on triples: (a1,a2,a3) <= (b1,b2,b3).
+inline bool leq(int a1, int a2, int a3,   int b1, int b2, int b3) {
+  if (a1 != b1) return a1 < b1;
+  return leq(a2, a3, b2, b3);
+}
+// stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r
+static void radixPass(int* a, int* b, int* r, int n, int K)
+{
+  // one counter per key value 0..K
+  int* bucket = (int*)bench_alloc(sizeof(int)*(K+1));
+  for (int key = 0;  key <= K;  key++) bucket[key] = 0;
+
+  // histogram of the keys r[a[idx]]
+  for (int idx = 0;  idx < n;  idx++) bucket[r[a[idx]]]++;
+
+  // exclusive prefix sums: bucket[key] becomes the first output slot for key
+  int running = 0;
+  for (int key = 0;  key <= K;  key++) {
+    int count = bucket[key];
+    bucket[key] = running;
+    running += count;
+  }
+
+  // scatter in input order, which keeps equal keys in their original order
+  for (int idx = 0;  idx < n;  idx++) {
+    int item = a[idx];
+    b[bucket[r[item]]++] = item;
+  }
+}
+
+// find the suffix array SA of s[0..n-1] in {1..K}^n
+// require s[n]=s[n+1]=s[n+2]=0, n>=2
+void suffixArray(int* s, int* SA, int n, int K) {
+  // n0, n1, n2: how many positions i in [0, n) have i%3 == 0, 1, 2.
+  // n02 = n0 + n2 is the number of sampled (mod 1 / mod 2) suffixes, including
+  // the dummy mod 1 suffix that is added when n0 > n1.
+  int n0=(n+2)/3, n1=(n+1)/3, n2=n/3, n02=n0+n2;
+  // Scratch arrays come from bench_alloc and are not released in this function.
+  int* s12  = (int*)bench_alloc(sizeof(int)*(n02+3));  s12[n02]= s12[n02+1]= s12[n02+2]=0;
+  int* SA12 = (int*)bench_alloc(sizeof(int)*(n02+3)); SA12[n02]=SA12[n02+1]=SA12[n02+2]=0;
+  int* s0   = (int*)bench_alloc(sizeof(int)*n0);
+  int* SA0  = (int*)bench_alloc(sizeof(int)*n0);
+
+  // generate positions of mod 1 and mod  2 suffixes
+  // the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1
+  for (int i=0, j=0;  i < n+(n0-n1);  i++) if (i%3 != 0) s12[j++] = i;
+
+  // lsb radix sort the mod 1 and mod 2 triples
+  // (third character first, then second, then first; each pass is stable)
+  radixPass(s12 , SA12, s+2, n02, K);
+  radixPass(SA12, s12 , s+1, n02, K);
+  radixPass(s12 , SA12, s  , n02, K);
+
+  // find lexicographic names of triples
+  // (a new name is issued whenever the triple differs from the previous one)
+  int name = 0, c0 = -1, c1 = -1, c2 = -1;
+  for (int i = 0;  i < n02;  i++) {
+    if (s[SA12[i]] != c0 || s[SA12[i]+1] != c1 || s[SA12[i]+2] != c2) {
+      name++;  c0 = s[SA12[i]];  c1 = s[SA12[i]+1];  c2 = s[SA12[i]+2];
+    }
+    if (SA12[i] % 3 == 1) { s12[SA12[i]/3]      = name; } // left half
+    else                  { s12[SA12[i]/3 + n0] = name; } // right half
+  }
+
+  // recurse if names are not yet unique
+  if (name < n02) {
+    suffixArray(s12, SA12, n02, name);
+    // store unique names in s12 using the suffix array
+    for (int i = 0;  i < n02;  i++) s12[SA12[i]] = i + 1;
+  } else // generate the suffix array of s12 directly
+    for (int i = 0;  i < n02;  i++) SA12[s12[i] - 1] = i;
+
+  // stably sort the mod 0 suffixes from SA12 by their first character
+  for (int i=0, j=0;  i < n02;  i++) if (SA12[i] < n0) s0[j++] = 3*SA12[i];
+  radixPass(s0, SA0, s, n0, K);
+
+  // merge sorted SA0 suffixes and sorted SA12 suffixes
+  // p walks SA0, t walks SA12 (starting past the dummy entry when n0 > n1),
+  // k is the output position in SA.
+  for (int p=0,  t=n0-n1,  k=0;  k < n;  k++) {
+// GetI maps the SA12 entry at index t back to a position in s: entries below
+// n0 are mod 1 suffixes, the others are mod 2 suffixes.
+#define GetI() (SA12[t] < n0 ? SA12[t] * 3 + 1 : (SA12[t] - n0) * 3 + 2)
+    int i = GetI(); // pos of current offset 12 suffix
+    int j = SA0[p]; // pos of current offset 0  suffix
+    // mod 1 suffixes are compared as (char, rank) pairs, mod 2 suffixes as
+    // (char, char, rank) triples, with ranks taken from s12.
+    if (SA12[t] < n0 ?
+        leq(s[i],       s12[SA12[t] + n0], s[j],       s12[j/3]) :
+        leq(s[i],s[i+1],s12[SA12[t]-n0+1], s[j],s[j+1],s12[j/3+n0]))
+    { // suffix from SA12 is smaller
+      SA[k] = i;  t++;
+      if (t == n02) { // done --- only SA0 suffixes left
+        for (k++;  p < n0;  p++, k++) SA[k] = SA0[p];
+      }
+    } else {
+      SA[k] = j;  p++;
+      if (p == n0)  { // done --- only SA12 suffixes left
+        for (k++;  t < n02;  t++, k++) SA[k] = GetI();
+      }
+    }
+  }
+}
+
+extern "C" {
+
+static int *s, *sa;
+
+// Set up the suffix-sort benchmark: allocate the text `s` and the output array
+// `sa` (N + 10 ints each) and fill s[0 .. N) with generated values in 0..25.
+void bench_ssort_prepare() {
+  N = setting->size;
+  bench_srand(1);
+  s = (int*)bench_alloc(sizeof(int)*(N+10));
+  sa = (int*)bench_alloc(sizeof(int)*(N+10));
+
+  for (int i = 0; i < N; i ++) {
+    s[i] = bench_rand() % 26;
+  }
+  // suffixArray() requires s[n] = s[n+1] = s[n+2] = 0; write the sentinels
+  // explicitly instead of depending on the allocator handing out zeroed memory.
+  s[N] = s[N + 1] = s[N + 2] = 0;
+}
+
+// Timed section: build the suffix array of s[0 .. N) into `sa`, passing 26 as
+// the maximum key value K.
+void bench_ssort_run() {
+  suffixArray(s, sa, N, 26);
+}
+
+// Returns non-zero when the checksum over sa[0 .. N) equals the expected value
+// stored in `setting`.
+int bench_ssort_validate() {
+  return checksum(sa, sa + N) == setting->checksum;
+}
+
+}
+
--- a/am-kernels/kernels/bad-apple/Makefile
+++ b/am-kernels/kernels/bad-apple/Makefile
@ -0,0 +1,23 @@
+# Output geometry of the video frames and format of the extracted audio track.
+VIDEO_ROW = 25
+VIDEO_COL = 80
+AUDIO_FREQ = 44100
+AUDIO_CHANNEL = 1
+
+# Source clip and the resource files generated from it.
+VIDEO_SRC = bad-apple.mp4
+VIDEO = build/video.frame
+AUDIO = build/audio.pcm
+
+NAME = bad-apple
+SRCS = bad-apple.c resources.S
+include $(AM_HOME)/Makefile
+
+# Pass the media parameters to the C code and the resource paths to the assembler.
+CFLAGS  += -DVIDEO_ROW=$(VIDEO_ROW) -DVIDEO_COL=$(VIDEO_COL) \
+           -DAUDIO_FREQ=$(AUDIO_FREQ) -DAUDIO_CHANNEL=$(AUDIO_CHANNEL)
+ASFLAGS += -DVIDEO_FILE=\"$(abspath $(VIDEO))\" -DAUDIO_FILE=\"$(abspath $(AUDIO))\"
+
+# Convert the clip to raw 1-bit frames. The output directory may not exist yet
+# when this rule runs, so create it first.
+$(VIDEO):
+	@mkdir -p $(@D)
+	ffmpeg -i $(VIDEO_SRC) -f image2pipe -s $(VIDEO_COL)x$(VIDEO_ROW) -vcodec rawvideo -pix_fmt monow $@
+
+# Extract the audio track as raw signed 16-bit little-endian PCM.
+$(AUDIO):
+	@mkdir -p $(@D)
+	ffmpeg -i $(VIDEO_SRC) -vn -acodec pcm_s16le -f s16le -ac $(AUDIO_CHANNEL) -ar $(AUDIO_FREQ) $@
+
+# resources.S names both generated files, so they must exist before it is assembled.
+resources.S: $(VIDEO) $(AUDIO)
--- a/Show more
+++ b/Show more
				`@ -0,0 +1 @@`
				`/nix/store/h1glxbcjgw3mv218w2wy73yih6s5p7iz-gdb-13.2`
				`@ -0,0 +1 @@`
				`/nix/store/jn4rd289315ip9fx03z2dm980wzg4iaz-am-kernels-2024.02.18-env`
				`@ -0,0 +1 @@`
				`Unnamed repository; edit this file 'description' to name the repository.`
				`@ -0,0 +1 @@`
				`0000000000000000000000000000000000000000 bb725d6f8223dd7de831c3b692e8c4531e9d01af xinyangli <lixinyang411@gmail.com> 1709436368 +0800 clone: from github.com:NJU-ProjectN/am-kernels.git`
				`@ -0,0 +1 @@`
				xM<>ΑjΓ0D{ΦW<CEA6><57>ƒtpJάC{<15>KLλ$X¦W!KΫDµ-»²T(!<21>^9%<25>Ϋξ,3σ¶ι†§/Ο<0F>—-<2D>ήXΣ‡^8ψΖ<>&yΕ‹ύ?<>”"4Ία”'²OZpΊ‰Ά <ΙJφ–‹Lπ<4C>ν6¬Ϊΰυ<CEB0>"°²ι@t<>ƒ<Ι0γe1VuAΗύ<04>=9γ88<0F>"5Kb4 ¦›HfΗ'Y<>ο¬~έWε—¬Ξ¶9Η+½ω<C2BD>Ε?«²ν<C2B2>Ω zgΊλey.8»$w93΅<33>Ρ±@jV~pΏΔΓδ§G5†δ:Ρ?¬`+