diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 0e24a7f..e902641 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -57,7 +57,7 @@ jobs: working-directory: ${{github.workspace}}/build shell: bash # Execute the build. You can specify a specific target with "--target " - run: cmake --build . --config $BUILD_TYPE -- -j + run: cmake --build . --config $BUILD_TYPE - name: Test working-directory: ${{github.workspace}}/build diff --git a/CMakeLists.txt b/CMakeLists.txt index 53857e3..450aa74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,8 @@ string( add_compile_options("${_opts}") option(USE_RUNTIME_CHECKS "Check arguments and emit warnings" OFF) -option(TRACK_STATISTICS "Tracks statistics such as register reads and energy use" ON) +option(TRACK_STATISTICS "Tracks statistics such as register reads and energy use" OFF) +option(USE_CUDA "Uses CUDA for GPU processing" ON) if (USE_RUNTIME_CHECKS) message(STATUS "With runtime checks") add_definitions(-DUSE_RUNTIME_CHECKS) @@ -29,6 +30,11 @@ if (TRACK_STATISTICS) add_definitions(-DTRACK_STATISTICS) endif () unset(TRACK_STATISTICS CACHE) +if (USE_CUDA) + message(STATUS "Using CUDA") + add_definitions(-DUSE_CUDA) +endif () +unset(USE_CUDA CACHE) IF (UNIX) @@ -49,7 +55,7 @@ ENDIF () set(CONAN_SYSTEM_INCLUDES ON) conan_basic_setup() -add_executable(${SIM_LIB} main.cpp src/simulator/registers/analogue_register.cpp include/simulator/registers/analogue_register.h src/simulator/registers/digital_register.cpp include/simulator/registers/digital_register.h src/simulator/buses/analogue_bus.cpp include/simulator/buses/analogue_bus.h src/simulator/buses/digital_bus.cpp include/simulator/buses/digital_bus.h src/simulator/base/pixel.cpp include/simulator/base/pixel.h src/simulator/base/architecture.cpp include/simulator/base/architecture.h src/simulator/pe/processing_element.cpp include/simulator/pe/processing_element.h src/simulator/units/comparator.cpp 
include/simulator/units/comparator.h src/simulator/units/squarer.cpp include/simulator/units/squarer.h src/simulator/registers/register.cpp include/simulator/registers/register.h src/simulator/base/component.cpp include/simulator/base/component.h src/simulator/memory/dram/dram3t_cell.cpp include/simulator/memory/dram3t_cell.h src/simulator/memory/sram/sram6t_cell.cpp include/simulator/memory/sram6t_cell.h src/simulator/memory/memory.cpp include/simulator/memory/memory.h src/simulator/memory/si/si_cell.cpp include/simulator/memory/si_cell.h src/simulator/util/utility.cpp include/simulator/util/utility.h src/simulator/metrics/cycle_counter.cpp include/simulator/metrics/cycle_counter.h src/simulator/metrics/json_writer.cpp include/simulator/metrics/json_writer.h include/simulator/ui/async_file_streamer.h include/simulator/ui/async_file_reader.h src/simulator/ui/ui.cpp include/simulator/ui/ui.h src/simulator/ui/file_watcher.cpp include/simulator/ui/file_watcher.h include/simulator/ui/base64_encoder.h src/simulator/ui/src/base64_encoder.cpp include/simulator/input/input_source.h src/simulator/input/live_input.cpp include/simulator/input/live_input.h src/simulator/input/image_input.cpp include/simulator/input/image_input.h src/simulator/input/video_input.cpp include/simulator/input/video_input.h include/simulator/base/plane_params.h src/simulator/base/plane_params.cpp include/simulator/adders/cla.h src/simulator/adders/cla.cpp include/simulator/base/config.h src/simulator/base/config.cpp include/simulator/metrics/stats_outputter.h src/simulator/memory/dram/dram_array.cpp include/simulator/memory/dram_array.h include/simulator/external/parser.h src/simulator/external/parser.cpp scamp5/scamp5.h scamp5/scamp5.cpp scamp5/analognet2/conv_instructions.h scamp5/analognet2/fc_weights.h scamp5/analognet2/analog_main.h scamp5/analognet2/analog_main.cpp scamp5_extended/scamp5_e.cpp scamp5_extended/scamp5_e.h scamp5_multiplexed/scamp5m.cpp scamp5_multiplexed/scamp5m.h 
include/simulator/alu/alu.h src/simulator/alu/alu.cpp include/simulator/adc/adc.h src/simulator/adc/adc.cpp include/simulator/metrics/packer.h src/simulator/metrics/packer.cpp include/simulator/metrics/pack_node.h src/simulator/metrics/pack_node.cpp scamp5_multiplexed/scamp5rmalt.cpp scamp5_multiplexed/scamp5rmalt.h scamp5_multiplexed/vj_classifier.cpp scamp5_multiplexed/vj_classifier.h scamp5_multiplexed/image.h) +add_executable(${SIM_LIB} main.cpp src/simulator/registers/analogue_register.cpp include/simulator/registers/analogue_register.h src/simulator/registers/digital_register.cpp include/simulator/registers/digital_register.h src/simulator/buses/analogue_bus.cpp include/simulator/buses/analogue_bus.h src/simulator/buses/digital_bus.cpp include/simulator/buses/digital_bus.h src/simulator/base/pixel.cpp include/simulator/base/pixel.h src/simulator/base/architecture.cpp include/simulator/base/architecture.h src/simulator/pe/processing_element.cpp include/simulator/pe/processing_element.h src/simulator/units/comparator.cpp include/simulator/units/comparator.h src/simulator/units/squarer.cpp include/simulator/units/squarer.h src/simulator/registers/register.cpp include/simulator/registers/register.h src/simulator/base/component.cpp include/simulator/base/component.h src/simulator/memory/dram/dram3t_cell.cpp include/simulator/memory/dram3t_cell.h src/simulator/memory/sram/sram6t_cell.cpp include/simulator/memory/sram6t_cell.h src/simulator/memory/memory.cpp include/simulator/memory/memory.h src/simulator/memory/si/si_cell.cpp include/simulator/memory/si_cell.h src/simulator/util/utility.cpp include/simulator/util/utility.h src/simulator/metrics/cycle_counter.cpp include/simulator/metrics/cycle_counter.h src/simulator/metrics/json_writer.cpp include/simulator/metrics/json_writer.h include/simulator/ui/async_file_streamer.h include/simulator/ui/async_file_reader.h src/simulator/ui/ui.cpp include/simulator/ui/ui.h src/simulator/ui/file_watcher.cpp 
include/simulator/ui/file_watcher.h include/simulator/ui/base64_encoder.h src/simulator/ui/src/base64_encoder.cpp include/simulator/input/input_source.h src/simulator/input/live_input.cpp include/simulator/input/live_input.h src/simulator/input/image_input.cpp include/simulator/input/image_input.h src/simulator/input/video_input.cpp include/simulator/input/video_input.h include/simulator/base/plane_params.h src/simulator/base/plane_params.cpp include/simulator/adders/cla.h src/simulator/adders/cla.cpp include/simulator/base/config.h src/simulator/base/config.cpp include/simulator/metrics/stats_outputter.h src/simulator/memory/dram/dram_array.cpp include/simulator/memory/dram_array.h include/simulator/external/parser.h src/simulator/external/parser.cpp scamp5/scamp5.h scamp5/scamp5.cpp scamp5/analognet2/conv_instructions.h scamp5/analognet2/fc_weights.h scamp5/analognet2/analog_main.h scamp5/analognet2/analog_main.cpp scamp5_extended/scamp5_e.cpp scamp5_extended/scamp5_e.h scamp5_multiplexed/scamp5m.cpp scamp5_multiplexed/scamp5m.h include/simulator/alu/alu.h src/simulator/alu/alu.cpp include/simulator/adc/adc.h src/simulator/adc/adc.cpp include/simulator/metrics/packer.h src/simulator/metrics/packer.cpp include/simulator/metrics/pack_node.h src/simulator/metrics/pack_node.cpp scamp5_multiplexed/scamp5rmalt.cpp scamp5_multiplexed/scamp5rmalt.h scamp5_multiplexed/vj_classifier.cpp scamp5_multiplexed/vj_classifier.h scamp5_multiplexed/image.h include/simulator/base/opencv_wrappers.h ./src/simulator/base/opencv_wrappers.cpp) if (WIN32) string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) diff --git a/README.md b/README.md index 40346ce..a3aeed1 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Tracking statistics is expensive and will drastically slow down execution. | ------------- | ------------- | ------------- | | USE_RUNTIME_CHECKS | Checks arguments at runtime and emits warnings. 
| OFF | | TRACK_STATISTICS | Tracks statistics such as register reads and energy use| OFF | +| USE_CUDA | Uses CUDA for all processing. Requires a CUDA-enabled GPU. Turn TRACK_STATISTICS off when this is enabled, because statistics tracking currently does not work together with CUDA| ON | # Build - (release|debug) (remmeber to remove the brackets around the choice before running) @@ -46,15 +47,16 @@ The main fields are explained here: | Field name | Required | Default | Description | | ------------- | ------------- | ------------- | ------------- | +| use_opencl | false | false | Whether to use the GPU (through OpenCL) instead of the CPU for processing. If set and no GPU is available, processing defaults to the CPU. CUDA can give better performance, but it is a compile-time option (see USE_CUDA) | | architecture | true | N/A | The name of the architecture that is being simulated. The architecture must be registered and should be a subclass of the `Architecture` class. See `SCAMP5.cpp` for an example architecture | | frames | false | 1000 | The number of frames to run the program for. A negative number is interpreted as infinite, i.e. run forever| | frame_time | false | true | Prints the amount of time in ms to process each frame from beginning to end to stdout| | with_stats | false | false | If this is enabled the statistics of the run will be printed at the end. Must be compiled with statistics support for this to do anything| | ui_enabled | false | false | Should the web UI server be enabled| | ui_registers_to_display | true if ui_enabled is set | false | The registers to display on the web UI. Must be defined in the config somewhere and added to the cache (or a property of the architecture)| -| _inherit | false | false | Special field. Sometimes the child component needs to inherit from the parent. This is achieved by use of this special field. The field must be a list of all fields to inherit which have been defined previously. Inheritance is not strictly limited to parents. 
In fact any property defined previously at any level can be used. This means that if you define the same property in different components only the latest will be preserved.| -| _name | false | false | Special field. Defines the identifier of the component| -| _component | false | false | Specifal field. The component class. Must match exactly so the correct class can be found and constructed| +| _inherit | false | N/A | Special field. Sometimes the child component needs to inherit from the parent. This is achieved by use of this special field. The field must be a list of all fields to inherit which have been defined previously. Inheritance is not strictly limited to parents. In fact any property defined previously at any level can be used. This means that if you define the same property in different components only the latest will be preserved.| +| _name | false | N/A | Special field. Defines the identifier of the component| +| _component | false | N/A | Special field. The component class. Must match exactly so the correct class can be found and constructed| The architecture set in the `architecture` field must be defined as an object. This part is hierarchical. You can add define each component that is necessary in the definition of the architecture. 
diff --git a/examples/multiplex/config.json b/examples/multiplex/config.json index 6ff5313..320f174 100644 --- a/examples/multiplex/config.json +++ b/examples/multiplex/config.json @@ -1,6 +1,7 @@ { "architecture": "SCAMP5M", - "frames": 25, + "frames": 1, + "with_opencl": true, "with_stats": true, "ui_enabled": true, "output_filename": "", @@ -9,7 +10,7 @@ "rows": 256, "cols": 256, "row_stride": 1, - "col_stride": 1, + "col_stride": 256, "origin": "TOP_RIGHT", "config": { "clock_rate": 10000000 @@ -39,8 +40,8 @@ ], "_component": "Dram", "_name": "dram", - "array_rows": 500, - "array_cols": 32 + "array_rows": 256, + "array_cols": 90 }, { "_inherit": [ diff --git a/examples/multiplex/downsample.txt b/examples/multiplex/downsample.txt new file mode 100644 index 0000000..a39775e --- /dev/null +++ b/examples/multiplex/downsample.txt @@ -0,0 +1,3 @@ +//get_image(A, D); +downsample(B, A, 0.5); +//display() diff --git a/examples/multiplex/integral.txt b/examples/multiplex/integral.txt new file mode 100644 index 0000000..071c8a8 --- /dev/null +++ b/examples/multiplex/integral.txt @@ -0,0 +1 @@ +integral_image() diff --git a/examples/scamp5/box3x3.txt b/examples/scamp5/box3x3.txt index d0471d0..a709ff7 100644 --- a/examples/scamp5/box3x3.txt +++ b/examples/scamp5/box3x3.txt @@ -1,5 +1,5 @@ // 3x3 box filter -get_image(A, D) +//get_image(A, D) diva(A, B, C); diva(A, B, C); diva(A, B, C); diff --git a/examples/scamp5/config.json b/examples/scamp5/config.json index 3c830fc..5d90f61 100644 --- a/examples/scamp5/config.json +++ b/examples/scamp5/config.json @@ -1,10 +1,12 @@ { "architecture": "SCAMP5", - "frames": 1, - "frame_time": true, - "with_stats": true, + "frames": 10000, + "use_opencl" : false, + "instr_rep" : 1, + "frame_time": true, + "with_stats": false, "ui_enabled": true, - "ui_registers_to_display" : ["A", "B", "C", "D", "E", "R5", "R6", "R7", "R12"], + "ui_registers_to_display" : ["A", "B", "C", "D", "E", "R5", "R6", "R7", "R12"], "SCAMP5": { "rows": 256, "cols": 
256, @@ -13,19 +15,19 @@ "origin" : "TOP_RIGHT", "config": { "process_node" : 180, - "clock_rate" : 1e7 + "clock_rate" : 1e7 }, "components" : [ - { - "_inherit" : ["config", "rows", "cols", "row_stride", "col_stride"], - "_component" : "ProcessingElement", - "_name" : "pe", - "analogue_registers" : ["PIX", "IN", "NEWS", "A", "B", "C", "D", "E", "F"], - "digital_registers" : ["FLAG", "SELECT", "RECT", "R0", "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", "R10", "R11", {"name" : "R12", "mask" : "R0"}], - "pixel" : { + { + "_inherit" : ["config", "rows", "cols", "row_stride", "col_stride"], + "_component" : "ProcessingElement", + "_name" : "pe", + "analogue_registers" : ["PIX", "IN", "NEWS", "A", "B", "C", "D", "E", "F"], + "digital_registers" : ["FLAG", "SELECT", "RECT", "R0", "R1", "R2", "R3", "R4", "R5", "R6", "R7", "R8", "R9", "R10", "R11", {"name" : "R12", "mask" : "R0"}], + "pixel" : { "camera_index" : 0, - "input_source" : "LIVE" - } + "input_source" : "LIVE" + } } ] } diff --git a/examples/scamp5/gaussian3x3.txt b/examples/scamp5/gaussian3x3.txt index fec85c1..d7dc0e1 100644 --- a/examples/scamp5/gaussian3x3.txt +++ b/examples/scamp5/gaussian3x3.txt @@ -1,4 +1,4 @@ -get_image(A, D); +//get_image(A, D); divq(C, A); divq(A, C); divq(C, A); diff --git a/examples/scamp5/gaussian5x5.txt b/examples/scamp5/gaussian5x5.txt index 6f3e82b..c6f253a 100644 --- a/examples/scamp5/gaussian5x5.txt +++ b/examples/scamp5/gaussian5x5.txt @@ -1,4 +1,4 @@ -get_image(A, D); +//get_image(A, D); div(E, C, A); diva(E, C, B); diva(E, C, B); diff --git a/examples/scamp5/horizontal_sobel.txt b/examples/scamp5/horizontal_sobel.txt index a8d5cd1..e92172c 100644 --- a/examples/scamp5/horizontal_sobel.txt +++ b/examples/scamp5/horizontal_sobel.txt @@ -3,4 +3,4 @@ movx(B, A, west); addx(A, B, A, east); addx(B, B, A, south); sub2x(A, B, north, north, B); -display(); +//display(); diff --git a/examples/scamp5/laplacian.txt b/examples/scamp5/laplacian.txt new file mode 100644 index 
0000000..5dacfc6 --- /dev/null +++ b/examples/scamp5/laplacian.txt @@ -0,0 +1,11 @@ +//get_image(A, D); +movx(F, A, north); +neg(B, A); +subx(D, F, south, B); +sub(C, D, B); +add(D, D, C); +subx(E, B, south, F); +addx(A, B, E, west); +mov2x(B, A, east, east); +add(C, C, D, E); +add(A, C, A, B); diff --git a/examples/scamp5/lat.txt b/examples/scamp5/lat.txt index 7dac799..dc58d66 100644 --- a/examples/scamp5/lat.txt +++ b/examples/scamp5/lat.txt @@ -1,6 +1,6 @@ // local adaptive thresholding. // works decently with 512x512 resolution images of say book pages -get_image(A, D) +//get_image(A, D) div(B, C, A); diva(B, C, D); diva(B, C, D); diff --git a/examples/scamp5/mean3x3.txt b/examples/scamp5/mean3x3.txt new file mode 100644 index 0000000..884ad0c --- /dev/null +++ b/examples/scamp5/mean3x3.txt @@ -0,0 +1,18 @@ +//get_image(A, D); +div(C, D, A); +diva(C, D, E); +diva(C, D, E); +diva(C, D, E); +diva(C, D, E); +diva(C, D, E); +movx(D, C, south); +movx(E, C, north); +add(A, D, C, E); +add(B, D, C, E); +add(C, D, C, E); +add(B, A, B, C); +add(A, B, A); +add(B, A, B); +movx(A, B, east); +movx(C, B, west); +add(B, B, C, A); diff --git a/examples/scamp5/motion.txt b/examples/scamp5/motion.txt index 42d6952..7a28951 100644 --- a/examples/scamp5/motion.txt +++ b/examples/scamp5/motion.txt @@ -1,5 +1,5 @@ -get_image(C,D); -scamp5_in(E, 15) +//get_image(C,D); +scamp5_in(E, 15); sub(D,C,F); // D = C - F mov(F,C); // F = C abs(B,D); // B = |D| @@ -7,5 +7,4 @@ sub(A,B,E); // A = B - E where(A); // where A > 0 MOV(R5, FLAG); all(); -//motion() -display() +//display() diff --git a/examples/scamp5/multiple_sobel.txt b/examples/scamp5/multiple_sobel.txt index 3d2b2f1..e9d10b4 100644 --- a/examples/scamp5/multiple_sobel.txt +++ b/examples/scamp5/multiple_sobel.txt @@ -8,4 +8,4 @@ addx(C, C, A, north); addx(B, B, D, east); sub2x(A, B, west, west, B); sub2x(B, C, south, south, C); -display() +//display() diff --git a/examples/scamp5/vertical_sobel.txt 
b/examples/scamp5/vertical_sobel.txt index 02ac4e0..8b3d72f 100644 --- a/examples/scamp5/vertical_sobel.txt +++ b/examples/scamp5/vertical_sobel.txt @@ -1,6 +1,6 @@ -get_image(A, D) +//get_image(A, D) movx(B, A, south); addx(A, B, A, north); addx(B, B, A, east); sub2x(A, B, west, west, B); -display() +//display() diff --git a/include/simulator/adders/cla.h b/include/simulator/adders/cla.h index 5e1a6c6..fbc2be6 100644 --- a/include/simulator/adders/cla.h +++ b/include/simulator/adders/cla.h @@ -23,7 +23,7 @@ class CarryLookAheadAdder : public Component { double calc_width() override; double calc_height() override; #endif - cv::Mat scratch; + cv::UMat scratch; public: CarryLookAheadAdder() = default; diff --git a/include/simulator/base/component.h b/include/simulator/base/component.h index d39bf89..ccf7d14 100644 --- a/include/simulator/base/component.h +++ b/include/simulator/base/component.h @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -22,15 +23,20 @@ class Component : public StatsOutputter { std::shared_ptr fit; protected: - int process_node_ = -1; // process node in nm that the other metrics are defined in terms of. for example + int process_node_ = 180; // process node in nm that the other metrics are defined in terms of. int rows_; // rows of the whole plane int cols_; // cols of the whole plane int row_stride_ = 1; int col_stride_ = 1; std::shared_ptr config_; - cv::Mat internal_mask; // Used to keep track of components in array when stride is not 1, i.e. spaces between components +#ifdef USE_CUDA + cv::cuda::GpuMat internal_mask; // Used to keep track of components in array when stride is not 1, i.e. spaces between components +#else + cv::UMat internal_mask; // Used to keep track of components in array when stride is not 1, i.e. 
spaces between components +#endif public: + void init(); void calc_internal_mask(); /*Setters*/ @@ -50,12 +56,17 @@ class Component : public StatsOutputter { double dynamic_power_; // in Watts double width_; // in Micrometres double height_; // in Mircometres - cv::Mat array_transistor_count_; - cv::Mat array_static_energy_; - cv::Mat array_dynamic_energy_; + cv::UMat array_static_energy_; + cv::UMat array_dynamic_energy_; // No GPUMat for these two as float ops are not always supported on GPUs but int ops usually are +#ifdef USE_CUDA + cv::cuda::GpuMat array_transistor_count_; +#else + cv::UMat array_transistor_count_; +#endif public: /* Update component with how much time has passed for operation. Used for updating static power. */ virtual void update_static(double time) = 0; + /* Returns the total amount of static energy use */ virtual cv::Mat get_static_energy_array(); /* Returns the total amount of dynamic energy use */ diff --git a/include/simulator/base/opencv_wrappers.h b/include/simulator/base/opencv_wrappers.h new file mode 100644 index 0000000..408e88b --- /dev/null +++ b/include/simulator/base/opencv_wrappers.h @@ -0,0 +1,21 @@ +// +// Created by jm1417 on 29/05/2021. 
+// + +#ifndef SIMULATOR_OPENCV_WRAPPERS_H +#define SIMULATOR_OPENCV_WRAPPERS_H + +#include +#include + +/*Wrappers around common OpenCV functionality which makes GPU switching easy */ + +namespace ocv_wrappers::arith { + void add(cv::OutputArray dst, cv::InputArray src1, cv::InputArray src2); + void add(cv::OutputArray dst, cv::InputArray src1, cv::InputArray src2, cv::InputArray mask); + +} + + + +#endif //SIMULATOR_OPENCV_WRAPPERS_H diff --git a/include/simulator/base/pixel.h b/include/simulator/base/pixel.h index d3ba444..886b946 100644 --- a/include/simulator/base/pixel.h +++ b/include/simulator/base/pixel.h @@ -35,7 +35,11 @@ class Pixel : public Component { void reset(); void read(Register& reg); - cv::Mat read(); +#ifdef USE_CUDA + cv::cuda::GpuMat& read(); +#else + cv::UMat& read(); +#endif double last_frame_time(); #ifdef TRACK_STATISTICS int calc_transistor_count() override; diff --git a/include/simulator/buses/analogue_bus.h b/include/simulator/buses/analogue_bus.h index b43fbb2..c0d67f0 100644 --- a/include/simulator/buses/analogue_bus.h +++ b/include/simulator/buses/analogue_bus.h @@ -12,7 +12,11 @@ class AnalogueBus { private: - cv::Mat scratch; +#ifdef USE_CUDA + cv::cuda::GpuMat scratch; +#else + cv::UMat scratch; +#endif public: // Analogue Register Transfer diff --git a/include/simulator/external/parser.h b/include/simulator/external/parser.h index 8c5c250..fe83f9b 100644 --- a/include/simulator/external/parser.h +++ b/include/simulator/external/parser.h @@ -22,6 +22,8 @@ class Parser { Parser(); std::vector enums_; ParserCache cache; + int repeat_ = 1; // Number of times to repeat each instruction. 
Useful for performance testing + public: static Parser& get_instance(){ @@ -37,6 +39,8 @@ class Parser { void set_property(const rttr::type& arch_type, const rttr::variant& arch, const std::string& name, rttr::variant value); rttr::variant create_instance(const std::string& arch_name, json arch_props); void parse_config(std::ifstream& config, std::ifstream& program); + + void setup_processing(json& j); }; #endif //SIMULATOR_PARSER_H diff --git a/include/simulator/input/image_input.h b/include/simulator/input/image_input.h index 96ad13d..01fa336 100644 --- a/include/simulator/input/image_input.h +++ b/include/simulator/input/image_input.h @@ -14,7 +14,11 @@ class ImageInput : public InputSource { public: ImageInput(int rows, int cols, const std::string& path); void read(Register& reg) override; - cv::Mat read() override; +#ifdef USE_CUDA + cv::cuda::GpuMat& read() override; +#else + cv::UMat& read() override; +#endif void reset() override; double last_frame_time() override; }; diff --git a/include/simulator/input/input_source.h b/include/simulator/input/input_source.h index 854d503..e506d70 100644 --- a/include/simulator/input/input_source.h +++ b/include/simulator/input/input_source.h @@ -7,19 +7,29 @@ #include +#include #include enum Source { LIVE, VIDEO, IMAGE }; class InputSource { protected: - cv::Mat frame; +#ifdef USE_CUDA + cv::cuda::GpuMat frame; +#else + cv::UMat frame; +#endif double time_taken = 0; int rows_, cols_ = 0; public: virtual void read(Register& reg) = 0; - virtual cv::Mat read() = 0; +#ifdef USE_CUDA + virtual cv::cuda::GpuMat& read() = 0; +#else + virtual cv::UMat& read() = 0; +#endif + virtual void reset() = 0; virtual double last_frame_time() = 0; diff --git a/include/simulator/input/live_input.h b/include/simulator/input/live_input.h index b7e6539..ddd9558 100644 --- a/include/simulator/input/live_input.h +++ b/include/simulator/input/live_input.h @@ -19,7 +19,11 @@ class LiveInput : public InputSource { LiveInput() = default; void 
read(Register& reg) override; - cv::Mat read() override; +#ifdef USE_CUDA + cv::cuda::GpuMat& read() override; +#else + cv::UMat& read() override; +#endif void reset() override; double last_frame_time() override; }; diff --git a/include/simulator/input/video_input.h b/include/simulator/input/video_input.h index 9d1adfd..72b89ff 100644 --- a/include/simulator/input/video_input.h +++ b/include/simulator/input/video_input.h @@ -14,7 +14,11 @@ class VideoInput : public LiveInput { public: VideoInput(int rows, int cols, const std::string& path); void read(Register& reg) override; - cv::Mat read() override; +#ifdef USE_CUDA + cv::cuda::GpuMat& read() override; +#else + cv::UMat& read() override; +#endif }; #endif // SIMULATOR_VIDEO_INPUT_H diff --git a/include/simulator/memory/dram3t_cell.h b/include/simulator/memory/dram3t_cell.h index 79958b1..b285be6 100644 --- a/include/simulator/memory/dram3t_cell.h +++ b/include/simulator/memory/dram3t_cell.h @@ -12,39 +12,27 @@ class Dram3tCell : public Memory { RTTR_ENABLE(Memory); + private: - int read_count_ = 0; - int write_count_ = 0; #ifdef TRACK_STATISTICS - int cycle_count_ = 2; // TODO find proper numbers for cycle counts - double dynamic_read_power_; // in Watts for a read - double dynamic_write_power_; // in Watts for a read - double time_; // time in seconds for read/write + int cycle_count_ = 2; // TODO calc proper numbers for cycle counts double refresh_time_ = 0.064; // In S int calc_transistor_count() override; double calc_static() override; double calc_dynamic() override; double calc_width() override; double calc_height() override; - double calc_dynamic_read(); - double calc_dynamic_write(); - cv::Mat scratch; + double calc_dynamic_read() override; + double calc_dynamic_write() override; + cv::UMat scratch; #endif -public: - + public: Dram3tCell() = default; void init(); #ifdef TRACK_STATISTICS - cv::Mat get_dynamic_energy_array() override; - void update_static(double time) override; int get_cycle_count() 
override; - void read(const cv::_InputOutputArray& mask) override; - void read() override; - void write(const cv::_InputOutputArray& mask) override; - void write() override; - void print_stats(const CycleCounter& counter) override; // void write_stats(const CycleCounter& counter, json& j) override; #endif }; diff --git a/include/simulator/memory/dram_array.h b/include/simulator/memory/dram_array.h index 9ae0220..7d64394 100644 --- a/include/simulator/memory/dram_array.h +++ b/include/simulator/memory/dram_array.h @@ -42,7 +42,7 @@ class Dram : public Component { void print_row(int array, int row); - void reset(); + void reset_val(); void update_dynamic(int count); diff --git a/include/simulator/memory/memory.h b/include/simulator/memory/memory.h index 76c3e53..2d4a5a5 100644 --- a/include/simulator/memory/memory.h +++ b/include/simulator/memory/memory.h @@ -16,15 +16,36 @@ enum MemoryType { }; class Memory : public Component { + RTTR_ENABLE(Component); + protected: + int read_count_ = 0; + int write_count_ = 0; + double dynamic_read_power_; // in Watts for a read + double dynamic_write_power_; // in Watts for a write + double time_; // time in seconds for a read/write +#ifdef USE_CUDA + cv::cuda::GpuMat scratch; +#else + cv::UMat scratch; +#endif + public: Memory() = default; - virtual void init() = 0; + virtual void init(); #ifdef TRACK_STATISTICS - virtual void read(const cv::_InputOutputArray &mask) = 0; - virtual void read() = 0; - virtual void write(const cv::_InputOutputArray &mask) = 0; - virtual void write() = 0; + virtual void read(const cv::_InputOutputArray &mask); + virtual void read(); + virtual void write(const cv::_InputOutputArray &mask); + virtual void write(); + + virtual double calc_dynamic_read() = 0; + virtual double calc_dynamic_write() = 0; + + cv::Mat get_dynamic_energy_array() override; + void update_static(double time) override; + int get_cycle_count() override; + void print_stats(const CycleCounter& counter) override; #endif static 
std::shared_ptr construct(MemoryType memory_type, int rows, int cols, int row_stride, int col_stride, const std::shared_ptr& config); }; diff --git a/include/simulator/memory/si_cell.h b/include/simulator/memory/si_cell.h index bba9a5f..36906df 100644 --- a/include/simulator/memory/si_cell.h +++ b/include/simulator/memory/si_cell.h @@ -9,39 +9,21 @@ class SiCell : public Memory { RTTR_ENABLE(Memory); - private: - int read_count_ = 0; - int write_count_ = 0; #ifdef TRACK_STATISTICS int cycle_count_ = 1; // TODO find proper numbers for cycle counts - double dynamic_read_power_; // in Watts for a read - double dynamic_write_power_; // in Watts for a read - double time_; // time in seconds for a read/write int calc_transistor_count() override; double calc_static() override; double calc_dynamic() override; - double calc_dynamic_read(); - double calc_dynamic_write(); + double calc_dynamic_read() override; + double calc_dynamic_write() override; double calc_width() override; double calc_height() override; - cv::Mat scratch; #endif public: SiCell() = default; void init(); -#ifdef TRACK_STATISTICS - cv::Mat get_dynamic_energy_array() override; - void update_static(double time) override; - int get_cycle_count() override; - void read(const cv::_InputOutputArray& mask) override; - void read() override; - void write(const cv::_InputOutputArray& mask) override; - void write() override; - void print_stats(const CycleCounter& counter) override; -// void write_stats(const CycleCounter& counter, json& j) override; -#endif }; #endif // SIMULATOR_SI_CELL_H diff --git a/include/simulator/memory/sram6t_cell.h b/include/simulator/memory/sram6t_cell.h index 9f0d21f..29dc381 100644 --- a/include/simulator/memory/sram6t_cell.h +++ b/include/simulator/memory/sram6t_cell.h @@ -13,21 +13,16 @@ class Sram6tCell : public Memory { RTTR_ENABLE(Memory); private: - int read_count_ = 0; - int write_count_ = 0; #ifdef TRACK_STATISTICS int cycle_count_ = 1; // TODO find proper numbers for cycle counts 
- double dynamic_read_power_; // in Watts for a read - double dynamic_write_power_; // in Watts for a read - double time_; // Time in seconds it takes for read/write int calc_transistor_count() override; double calc_static() override; double calc_dynamic() override; - double calc_dynamic_read(); - double calc_dynamic_write(); + double calc_dynamic_read() override; + double calc_dynamic_write() override; double calc_width() override; double calc_height() override; - cv::Mat scratch; + cv::UMat scratch; #endif public: @@ -35,15 +30,7 @@ class Sram6tCell : public Memory { void init(); #ifdef TRACK_STATISTICS - cv::Mat get_dynamic_energy_array() override; - void update_static(double time) override; int get_cycle_count() override; - void read(const cv::_InputOutputArray& mask) override; - void read() override; - void write(const cv::_InputOutputArray& mask) override; - void write() override; - void print_stats(const CycleCounter& counter) override; -// void write_stats(const CycleCounter& counter, json& j) override; #endif }; diff --git a/include/simulator/registers/analogue_register.h b/include/simulator/registers/analogue_register.h index 409805e..4a2f577 100644 --- a/include/simulator/registers/analogue_register.h +++ b/include/simulator/registers/analogue_register.h @@ -14,6 +14,11 @@ class AnalogueRegister : public Register { AnalogueRegister(int rows, int cols, const std::shared_ptr& config, int row_stride = 1, int col_stride = 1, MemoryType memory = MemoryType::S2I); AnalogueRegister(int rows, int cols, int row_stride = 1, int col_stride = 1); AnalogueRegister(const cv::Mat& data, int row_stride = 1, int col_stride = 1); +#ifdef USE_CUDA + AnalogueRegister(const cv::cuda::GpuMat& data, int row_stride = 1, int col_stride = 1); +#else + AnalogueRegister(const cv::UMat& data, int row_stride = 1, int col_stride = 1); +#endif AnalogueRegister& operator()(const std::string& name); diff --git a/include/simulator/registers/digital_register.h 
b/include/simulator/registers/digital_register.h index 8f71188..cbb8efd 100644 --- a/include/simulator/registers/digital_register.h +++ b/include/simulator/registers/digital_register.h @@ -12,17 +12,30 @@ class DigitalRegister : public Register { RTTR_ENABLE(Register); - std::shared_ptr mask_ = std::make_shared(cv::noArray().getMat_()); +#ifdef USE_CUDA + std::shared_ptr mask_ = std::make_shared(cv::noArray()); +#else + std::shared_ptr mask_ = std::make_shared(cv::noArray().getUMat()); +#endif public: DigitalRegister(int rows, int columns, const std::shared_ptr& config, int row_stride = 1, int col_stride = 1, MemoryType memory_type = MemoryType::DRAM3T); DigitalRegister(int rows, int cols, int row_stride = 1, int col_stride = 1); +#ifdef USE_CUDA + DigitalRegister(const cv::cuda::GpuMat& data, int row_stride = 1, int col_stride = 1); +#else + DigitalRegister(const cv::UMat& data, int row_stride = 1, int col_stride = 1); +#endif DigitalRegister(const cv::Mat& data, int row_stride = 1, int col_stride = 1); DigitalRegister& operator()(const std::string& name); void set_mask(const std::shared_ptr& mask); - cv::Mat& get_mask(); +#ifdef USE_CUDA + cv::cuda::GpuMat& get_mask(); +#else + cv::UMat& get_mask(); +#endif void set(); void clear(); diff --git a/include/simulator/registers/register.h b/include/simulator/registers/register.h index 4e09bf9..ea1c35d 100644 --- a/include/simulator/registers/register.h +++ b/include/simulator/registers/register.h @@ -6,12 +6,12 @@ #define SIMULATOR_REGISTER_H #include +#include +#include #include "simulator/base/component.h" -#include "simulator/metrics/cycle_counter.h" #include "simulator/memory/memory.h" - -#include +#include "simulator/metrics/cycle_counter.h" class Register : public Component { RTTR_ENABLE(); @@ -20,7 +20,11 @@ class Register : public Component { int type_; private: - cv::Mat value_; +#ifdef USE_CUDA + cv::cuda::GpuMat value_; +#else + cv::UMat value_; +#endif public: std::string name_; @@ -50,19 +54,46 @@ class 
Register : public Component { double get_width() override; double get_height() override; int get_transistor_count() override; + #endif void set_memory(MemoryType memory_type); void set_type(int type); - cv::Mat& read(); - void write(cv::Mat& data); +#ifdef USE_CUDA + cv::cuda::GpuMat& read(); +#else + cv::UMat& read(); +#endif +#ifdef USE_CUDA + void write(cv::cuda::GpuMat& data); +#else + void write(cv::UMat& data); +#endif +#ifdef USE_CUDA + void write(const cv::cuda::GpuMat& data); +#else + void write(const cv::UMat& data); +#endif void write(const cv::Mat& data); - void write(cv::Mat& data, cv::Mat& mask); +#ifdef USE_CUDA + void write(cv::cuda::GpuMat& data, cv::cuda::GpuMat& mask); +#else + void write(cv::UMat& data, cv::UMat& mask); +#endif +#ifdef USE_CUDA + void write(cv::cuda::GpuMat& data, cv::Mat& mask); +#else + void write(cv::UMat& data, cv::Mat& mask); +#endif void write(Register& data); void write(Register& data, Register& mask); void write(int data); - void write(int data, cv::Mat& mask); +#ifdef USE_CUDA + void write(int data, cv::cuda::GpuMat& mask); +#else + void write(int data, cv::UMat& mask); +#endif void write(int data, Register& mask); }; diff --git a/include/simulator/util/utility.h b/include/simulator/util/utility.h index 9a997f9..d44e491 100644 --- a/include/simulator/util/utility.h +++ b/include/simulator/util/utility.h @@ -45,10 +45,16 @@ void on_mouse_reg(int event, int x, int y, int, void* reg) { if(event != cv::EVENT_MOUSEMOVE) return; auto* dr = static_cast(reg); + cv::Mat m; +#ifdef USE_CUDA + dr->read().download(m); +#else + m = dr->read().getMat(cv::ACCESS_READ); +#endif if(y < 0 || x < 0 || y > dr->read().rows || x > dr->read().cols) return; std::cout << "(" << x << ", " << y << ") ...... 
" - << (int) dr->read().at(y, x) << '\n'; + << (int) m.at(y, x) << '\n'; } template diff --git a/misc/check.py b/misc/check.py deleted file mode 100644 index 935aa3f..0000000 --- a/misc/check.py +++ /dev/null @@ -1,15 +0,0 @@ -first = "0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 10 10 10 9 10 9 9 9 9 9 9 8 9 9 9 9 9 9 9 9 10 10 10 9 8 7 6 5 5 6 5 5 6 7 8 9 10 13 15 17 17 17 17 17 15 13 13 12 10 9 9 7 8 11 12 11 6 3 3 3 4 8 16 49 49 47 46 46 46 45 45 45 45 44 43 42 42 41 42 43 42 41 41 42 43 46 44 42 38 34 32 31 32 33 35 37 37 39 41 45 50 59 64 68 68 68 68 66 60 54 54 52 46 43 43 40 41 47 51 46 33 21 20 22 24 38 65 49 48 47 45 45 45 44 44 43 43 43 43 42 42 41 42 43 41 40 38 39 40 41 39 36 32 32 33 35 38 38 40 41 40 39 42 45 53 63 69 72 68 63 61 59 57 54 55 50 45 44 46 46 44 45 45 41 29 20 20 22 22 37 66 46 46 45 44 43 43 43 43 43 42 42 42 42 41 41 40 39 37 35 34 34 35 36 35 33 32 34 38 40 42 40 43 41 38 37 38 42 54 66 71 70 66 60 57 56 56 55 56 53 51 51 52 53 49 45 39 35 27 20 20 22 21 36 65 45 43 43 42 41 41 41 41 41 40 40 40 39 38 38 37 35 31 29 28 29 29 33 36 38 38 38 42 44 45 43 42 40 40 32 30 37 53 68 72 68 63 61 59 59 59 59 59 59 64 69 72 67 58 49 41 35 27 20 20 22 22 35 64 44 40 39 39 38 38 38 38 38 39 39 37 35 34 32 29 28 25 23 25 28 31 36 38 39 40 43 46 52 50 50 54 51 47 41 36 37 52 70 72 70 67 65 63 63 62 62 63 64 70 77 85 79 66 52 45 38 27 20 20 22 22 35 64 39 37 39 38 37 36 36 37 37 36 36 33 31 27 26 23 22 22 23 25 30 35 39 44 43 45 52 53 55 59 65 66 79 80 72 62 59 60 71 76 73 69 69 67 65 63 64 64 64 69 78 85 85 76 55 49 42 30 20 20 23 23 34 64 41 42 44 42 41 39 38 38 37 36 33 28 25 21 21 22 23 26 28 29 32 36 43 48 48 
51 54 66 69 69 77 93 99 99 86 80 82 79 69 71 74 71 72 72 69 66 65 65 64 62 64 73 78 73 60 54 49 35 21 21 24 24 34 64 46 46 47 45 43 40 38 36 37 42 40 30 25 22 23 26 27 31 35 35 34 37 43 49 51 56 67 81 83 84 81 85 101 106 97 80 73 76 95 86 68 73 74 73 73 70 68 66 64 56 51 58 69 68 65 64 57 40 23 20 24 24 34 64 52 46 43 41 38 34 31 29 37 62 76 60 32 24 25 27 29 36 42 42 37 36 44 51 55 64 80 92 99 86 65 73 89 85 89 97 89 78 68 88 88 68 66 71 74 73 72 69 65 55 44 48 62 71 79 82 72 45 26 20 24 24 33 64 46 36 32 30 28 25 21 21 39 84 105 90 53 33 34 31 29 35 39 39 37 40 52 59 64 70 90 91 88 74 61 66 71 63 71 79 95 102 83 73 83 87 76 70 73 75 75 73 68 55 42 44 57 76 93 98 86 53 27 20 24 24 34 64 35 24 22 22 20 20 17 19 39 84 105 91 57 38 38 37 26 27 32 32 36 47 61 67 69 81 100 82 74 72 71 68 67 64 62 57 60 81 88 75 68 84 89 82 72 74 75 77 70 58 41 38 52 74 93 101 91 63 30 21 23 25 35 63 20 19 19 19 18 18 17 19 35 68 88 74 43 34 35 33 23 17 18 24 43 60 68 69 69 77 87 89 91 94 94 87 77 80 75 58 49 44 60 68 67 67 78 83 78 68 75 76 73 62 43 35 46 64 84 97 91 72 36 21 24 26 34 64 18 18 18 18 19 18 18 19 30 55 68 57 32 25 27 23 18 16 19 34 61 69 71 71 74 85 93 103 120 127 127 127 127 109 112 104 80 54 46 51 63 56 62 76 80 71 66 74 75 64 44 33 43 59 73 89 91 82 43 21 24 26 35 65 20 21 21 19 20 20 18 18 27 50 63 56 37 23 21 19 19 18 29 54 74 70 69 68 75 87 103 127 127 127 127 127 127 127 127 127 127 115 74 49 56 64 65 67 77 74 68 67 73 63 46 37 45 58 68 84 90 83 45 22 25 26 36 66 22 23 22 20 20 20 19 19 35 59 78 68 45 24 21 19 18 23 42 74 80 69 69 74 77 95 121 127 127 127 127 127 127 127 127 127 127 127 127 95 65 64 66 59 74 77 70 61 66 63 46 40 43 56 67 78 83 75 44 22 25 29 41 70 23 24 22 20 19 19 19 23 52 83 104 95 67 33 22 22 23 31 70 86 73 65 64 74 78 98 127 127 127 127 127 127 127 127 127 127 127 127 127 127 94 67 62 62 65 75 73 59 59 61 50 44 47 56 65 69 68 61 38 21 27 30 43 72 24 24 22 20 20 20 21 29 67 104 123 114 86 39 20 21 27 54 85 80 65 54 64 71 69 102 127 127 127 127 
127 127 127 127 127 127 127 127 127 127 125 79 60 57 69 70 73 61 55 61 52 43 56 68 72 69 60 45 29 22 26 30 42 70 24 24 22 21 22 22 23 32 71 108 125 113 81 38 20 25 37 69 82 69 55 57 63 61 83 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 97 60 46 58 76 70 70 54 56 53 60 75 84 82 72 61 45 24 22 26 29 39 66 23 24 24 24 24 24 23 30 63 93 108 94 63 30 25 31 51 80 76 59 52 60 58 70 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 109 64 32 41 75 72 72 60 55 56 66 83 89 85 77 74 61 32 22 26 27 36 64 24 26 26 26 24 24 23 25 46 75 86 75 44 24 30 40 75 81 72 46 55 55 54 108 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 83 40 36 66 76 66 67 55 54 61 72 78 75 80 80 72 44 23 24 26 34 63 27 27 28 26 24 24 23 25 43 74 89 82 50 32 35 63 80 78 54 48 45 47 65 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 109 46 45 53 81 63 66 59 54 56 53 59 66 72 75 69 42 20 25 26 32 61 28 31 29 25 24 25 25 30 56 90 109 102 73 47 47 71 78 72 43 35 33 42 123 127 127 127 127 127 123 127 127 127 127 127 127 127 127 127 127 127 127 127 111 45 53 55 77 67 63 64 57 50 49 50 57 59 58 49 34 23 26 27 33 61 52 53 35 27 27 27 27 35 64 103 124 115 99 94 83 83 77 63 33 18 22 71 127 127 127 127 127 127 127 126 127 127 127 127 127 127 127 127 119 125 127 127 109 48 61 61 67 66 60 62 56 48 51 50 56 55 50 51 50 39 27 29 34 61 86 84 58 27 25 29 29 37 77 123 127 124 116 118 101 81 73 53 35 18 30 117 127 127 127 127 127 127 127 127 127 127 127 127 127 127 121 118 127 127 127 106 96 55 64 63 62 67 62 60 50 52 49 58 67 69 71 76 71 62 34 29 34 62 94 91 65 32 27 28 28 53 113 127 127 110 101 107 95 80 72 46 46 23 48 127 127 127 127 127 117 107 127 127 127 127 127 127 127 127 127 127 127 127 127 127 94 61 67 65 59 69 59 61 50 51 49 60 73 94 104 99 75 60 34 28 33 61 84 79 50 29 27 27 28 68 127 127 127 79 70 85 91 79 66 52 56 37 64 127 127 127 127 127 127 108 127 127 127 127 127 127 127 127 127 127 127 120 118 119 104 68 66 62 63 68 59 
58 52 49 52 56 72 114 127 122 72 39 29 26 32 61 64 58 40 28 27 26 27 73 127 127 127 57 57 74 85 78 68 60 59 49 80 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 92 114 89 121 115 68 64 64 62 66 62 57 56 48 47 54 70 122 127 127 73 26 27 29 33 62 64 62 45 30 28 28 28 73 127 127 127 63 72 85 82 77 72 71 63 54 89 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 125 117 116 127 127 68 62 62 63 64 65 54 59 48 46 53 65 111 127 127 76 23 28 29 33 62 68 67 48 32 30 29 30 72 127 127 127 70 82 89 78 77 76 78 63 56 92 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 68 59 55 64 59 67 53 55 52 46 53 62 99 127 127 80 25 29 29 33 61 70 69 51 36 33 29 27 68 127 127 124 71 89 93 82 83 81 78 54 61 97 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 69 49 49 60 61 66 53 55 58 50 48 60 87 127 127 80 25 28 29 33 61 72 67 52 47 42 33 27 65 127 127 116 76 92 92 83 84 82 80 59 62 101 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 61 35 45 58 60 61 50 53 56 52 45 57 87 127 127 78 25 29 30 33 62 82 72 57 56 52 42 28 60 127 127 115 84 94 89 83 85 85 80 63 75 115 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 116 45 32 42 58 60 56 47 52 55 49 43 57 92 121 127 78 26 27 32 36 63 84 71 65 68 62 53 39 65 127 127 116 90 92 85 82 85 86 81 68 76 79 127 127 127 127 127 127 127 127 127 127 127 127 109 120 127 127 127 127 127 127 127 95 34 35 45 55 66 54 50 52 51 52 44 54 91 107 125 75 38 42 54 59 78 99 84 86 96 94 89 76 87 126 127 114 94 92 84 88 88 86 82 71 86 41 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 125 73 39 40 49 62 65 54 53 47 52 50 43 51 75 89 98 68 53 66 82 89 98 108 93 102 119 125 121 107 99 117 119 111 99 90 84 87 87 84 81 76 127 88 117 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 109 55 44 54 54 61 68 57 51 52 53 48 41 48 63 73 
71 62 63 79 94 103 109 111 96 112 127 127 127 120 97 97 100 101 99 89 83 82 86 87 80 71 78 47 86 127 127 127 127 127 127 127 127 127 127 127 127 127 107 101 127 127 127 127 97 39 54 68 62 58 68 64 55 55 53 46 42 43 54 60 61 60 65 78 90 99 107 117 105 113 127 127 127 117 88 81 89 94 99 89 83 78 85 87 79 75 57 38 49 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 126 82 31 60 80 67 65 66 73 60 56 56 48 42 41 47 56 59 62 64 73 81 86 98 114 106 115 120 125 127 109 90 92 103 103 99 89 80 76 84 87 81 77 58 43 39 97 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 117 50 36 63 81 70 67 69 78 68 57 55 52 47 43 46 59 62 59 56 60 63 66 80 127 122 112 112 115 118 112 108 126 127 127 105 90 78 76 84 87 79 75 63 41 41 44 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 124 77 28 45 60 76 72 73 65 77 78 61 56 55 54 48 48 59 65 63 49 39 40 40 57 127 127 124 109 109 126 127 127 127 127 127 116 91 74 72 84 85 78 72 65 46 41 36 62 127 127 127 127 127 127 127 127 127 127 127 127 127 127 126 89 36 39 53 60 66 70 75 64 72 79 65 60 55 52 52 49 59 66 62 44 27 25 25 41 127 127 127 120 112 127 127 127 127 127 127 118 89 72 70 76 79 73 68 61 49 45 37 32 92 127 127 127 127 127 127 127 127 127 127 127 127 123 90 43 36 57 62 57 55 65 67 60 70 78 69 61 59 54 57 51 59 59 53 40 27 25 27 46 127 127 127 127 127 127 127 127 127 127 127 104 81 72 66 64 76 69 60 56 48 47 39 33 75 109 127 127 127 127 127 127 127 127 127 127 108 80 48 31 61 74 66 58 54 59 62 58 71 74 71 65 64 58 56 56 54 50 38 28 25 28 46 66 127 127 127 127 127 127 127 127 127 127 121 92 75 69 62 55 69 63 54 54 44 49 39 33 67 107 113 127 127 127 127 127 127 127 127 98 66 52 42 47 77 79 64 69 65 60 66 52 71 71 74 68 63 64 60 60 52 43 28 24 29 47 74 90 127 127 127 127 127 127 127 127 127 127 112 81 69 64 57 52 60 56 48 46 37 44 39 30 57 113 119 113 118 119 116 117 118 104 81 68 68 65 46 69 82 75 66 84 73 65 64 44 71 72 75 70 61 68 70 62 56 44 31 74 110 117 127 127 127 127 127 127 
127 127 127 127 127 127 97 73 65 59 53 51 58 51 41 42 35 30 37 32 45 111 126 121 117 112 103 91 82 81 84 90 89 67 61 82 74 66 76 92 74 70 66 38 69 69 72 70 53 67 76 73 64 52 52 127 127 127 127 127 117 127 127 127 127 127 127 127 127 127 84 65 64 55 50 50 56 47 37 37 32 24 27 29 36 108 127 127 127 125 116 106 97 97 99 101 87 63 79 76 68 67 79 86 70 74 62 39 70 70 70 69 54 67 78 73 76 68 82 127 127 127 127 127 122 127 127 127 127 127 127 127 127 118 83 66 59 48 46 46 51 42 36 37 28 22 19 25 31 103 127 127 127 127 127 119 111 109 112 109 84 78 86 74 63 75 82 78 66 72 51 41 68 68 63 62 48 65 75 85 93 94 113 127 127 127 127 127 127 127 127 127 127 127 127 127 127 115 81 63 52 43 42 47 42 38 34 35 28 19 19 20 26 94 127 127 127 127 127 127 126 122 124 115 92 86 81 67 65 77 79 69 69 66 36 47 64 63 56 44 40 60 87 115 125 110 115 127 127 127 127 127 127 127 127 127 127 127 127 127 127 107 74 55 45 40 39 42 36 37 33 32 24 16 18 17 36 98 127 127 127 127 127 127 127 127 127 116 97 88 75 67 70 73 73 70 63 48 30 51 56 58 41 30 42 64 110 127 127 127 127 127 127 127 127 127 122 116 112 127 127 127 127 127 115 94 64 50 43 39 38 35 37 33 28 27 21 18 17 21 65 126 127 127 127 127 127 127 127 127 127 121 101 89 68 71 72 68 65 63 51 29 32 49 53 50 31 29 43 79 108 127 127 127 127 127 127 127 127 127 119 119 123 127 127 127 127 105 88 71 55 39 39 40 29 34 32 31 30 26 18 15 21 32 104 127 127 127 127 127 127 127 127 127 127 127 104 87 68 69 68 63 58 44 37 25 32 49 45 31 27 32 48 90 121 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 96 73 62 51 36 36 34 26 30 32 34 28 21 21 23 32 42 121 127 127 127 127 127 127 127 127 127 127 111 85 84 71 71 65 57 41 29 27 23 27 42 31 31 30 42 59 92 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 113 94 66 46 39 36 36 37 40 47 53 63 78 56 39 43 46 84 127 127 127 127 127 127 127 127 94 57 60 81 78 71 71 59 44 28 26 30 28 31 35 27 38 44 55 77 92 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 91 67 51 
74 100 114 111 106 102 110 119 105 52 49 49 51 53 57 68 87 96 89 76 59 50 49 53 61 81 73 65 59 42 43 30 32 32 35 35 33 35 46 57 60 79 100 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 81 99 125 127 127 127 127 127 127 127 127 112 50 49 52 51 54 54 52 52 51 50 50 49 50 54 57 65 79 67 65 51 49 60 42 45 42 40 35 38 41 54 67 71 81 116 123 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 125 127 127 127 127 127 127 127 127 127 127 127 111 52 54 56 54 54 55 54 55 54 53 53 53 55 59 56 67 77 69 60 56 61 71 57 57 48 44 47 47 53 62 77 78 79 124 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 70 50 53 56 57 57 56 59 59 57 55 55 55 56 59 51 67 76 65 58 64 69 69 71 61 56 52 47 55 68 67 79 83 81 116 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 55 52 53 57 54 52 52 56 58 57 52 54 54 56 56 57 64 69 61 56 67 65 67 67 65 60 56 58 67 74 74 77 83 82 102 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 107 56 53 54 57 57 53 52 53 56 59 55 56 56 56 54 62 64 64 56 58 68 66 63 71 64 57 54 64 67 76 82 76 84 88 90 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 127 69 53 54 56 59 59 51 53 54 56 57 58 54 55 54 54 66 64 62 59 62 67 57 67 75 65 55 59 66 71 72 80 81 82 91 85 125 127 127 127 127 127 127 127 127 127" - - -second = "0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 10 10 10 9 10 9 9 9 9 9 9 8 9 9 9 9 9 9 9 9 10 10 10 9 8 7 6 5 5 6 5 5 6 7 8 9 10 13 15 
17 17 17 17 17 15 13 13 12 10 9 9 7 8 11 12 11 6 3 3 3 4 8 16 49 49 47 46 46 46 45 45 45 45 44 43 42 42 41 42 43 42 41 41 42 43 46 44 42 38 34 32 31 32 33 35 37 37 39 41 45 50 59 64 68 68 68 68 66 60 54 54 52 46 43 43 40 41 47 51 46 33 21 20 22 24 38 65 49 48 47 45 45 45 44 44 43 43 43 43 42 42 41 42 43 41 40 38 39 40 41 39 36 32 32 33 35 38 38 40 41 40 39 42 45 53 63 69 72 68 63 61 59 57 54 55 50 45 44 46 46 44 45 45 41 29 20 20 22 22 37 66 46 46 45 44 43 43 43 43 43 42 42 42 42 41 41 40 39 37 35 34 34 35 36 35 33 32 34 38 40 42 40 43 41 38 37 38 42 54 66 71 70 66 60 57 56 56 55 56 53 51 51 52 53 49 45 39 35 27 20 20 22 21 36 65 45 43 43 42 41 41 41 41 41 40 40 40 39 38 38 37 35 31 29 28 29 29 33 36 38 38 38 42 44 45 43 42 40 40 32 30 37 53 68 72 68 63 61 59 59 59 59 59 59 64 69 72 67 58 49 41 35 27 20 20 22 22 35 64 44 40 39 39 38 38 38 38 38 39 39 37 35 34 32 29 28 25 23 25 28 31 36 38 39 40 43 46 52 50 50 54 51 47 41 36 37 52 70 72 70 67 65 63 63 62 62 63 64 70 77 85 79 66 52 45 38 27 20 20 22 22 35 64 39 37 39 38 37 36 36 37 37 36 36 33 31 27 26 23 22 22 23 25 30 35 39 44 43 45 52 53 55 59 65 66 79 80 72 62 59 60 71 76 73 69 69 67 65 63 64 64 64 69 78 85 85 76 55 49 42 30 20 20 23 23 34 64 41 42 44 42 41 39 38 38 37 36 33 28 25 21 21 22 23 26 28 29 32 36 43 48 48 51 54 66 69 69 77 93 99 99 86 80 82 79 69 71 74 71 72 72 69 66 65 65 64 62 64 73 78 73 60 54 49 35 21 21 24 24 34 64 46 46 47 45 43 40 38 36 37 42 40 30 25 22 23 26 27 31 35 35 34 37 43 49 51 56 67 81 83 84 81 85 101 106 97 80 73 76 95 86 68 73 74 73 73 70 68 66 64 56 51 58 69 68 65 64 57 40 23 20 24 24 34 64 52 46 43 41 38 34 31 29 37 62 76 60 32 24 25 27 29 36 42 42 37 36 44 51 55 64 80 92 99 86 65 73 89 85 89 97 89 78 68 88 88 68 66 71 74 73 72 69 65 55 44 48 62 71 79 82 72 45 26 20 24 24 33 64 46 36 32 30 28 25 21 21 39 84 105 90 53 33 34 31 29 35 39 39 37 40 52 59 64 70 90 91 88 74 61 66 71 63 71 79 95 102 83 73 83 87 76 70 73 75 75 73 68 55 42 44 57 76 93 98 86 53 27 20 24 24 34 64 35 24 22 22 
20 20 17 19 39 84 105 91 57 38 38 37 26 27 32 32 36 47 61 67 69 81 100 82 74 72 71 68 67 64 62 57 60 81 88 75 68 84 89 82 72 74 75 77 70 58 41 38 52 74 93 101 91 63 30 21 23 25 35 63 20 19 19 19 18 18 17 19 35 68 88 74 43 34 35 33 23 17 18 24 43 60 68 69 69 77 87 89 91 94 94 87 77 80 75 58 49 44 60 68 67 67 78 83 78 68 75 76 73 62 43 35 46 64 84 97 91 72 36 21 24 26 34 64 18 18 18 18 19 18 18 19 30 55 68 57 32 25 27 23 18 16 19 34 61 69 71 71 74 85 93 103 120 136 149 145 127 109 112 104 80 54 46 51 63 56 62 76 80 71 66 74 75 64 44 33 43 59 73 89 91 82 43 21 24 26 35 65 20 21 21 19 20 20 18 18 27 50 63 56 37 23 21 19 19 18 29 54 74 70 69 68 75 87 103 127 165 198 210 208 196 173 164 161 146 115 74 49 56 64 65 67 77 74 68 67 73 63 46 37 45 58 68 84 90 83 45 22 25 26 36 66 22 23 22 20 20 20 19 19 35 59 78 68 45 24 21 19 18 23 42 74 80 69 69 74 77 95 121 162 195 215 223 225 226 219 210 208 199 181 147 95 65 64 66 59 74 77 70 61 66 63 46 40 43 56 67 78 83 75 44 22 25 29 41 70 23 24 22 20 19 19 19 23 52 83 104 95 67 33 22 22 23 31 70 86 73 65 64 74 78 98 141 187 209 220 228 231 228 226 223 218 214 203 187 150 94 67 62 62 65 75 73 59 59 61 50 44 47 56 65 69 68 61 38 21 27 30 43 72 24 24 22 20 20 20 21 29 67 104 123 114 86 39 20 21 27 54 85 80 65 54 64 71 69 102 173 205 211 218 224 228 227 224 222 220 216 208 198 177 125 79 60 57 69 70 73 61 55 61 52 43 56 68 72 69 60 45 29 22 26 30 42 70 24 24 22 21 22 22 23 32 71 108 125 113 81 38 20 25 37 69 82 69 55 57 63 61 83 155 199 208 209 217 221 223 224 223 220 217 212 207 199 186 151 97 60 46 58 76 70 70 54 56 53 60 75 84 82 72 61 45 24 22 26 29 39 66 23 24 24 24 24 24 23 30 63 93 108 94 63 30 25 31 51 80 76 59 52 60 58 70 147 198 205 207 214 220 222 224 225 222 217 214 209 203 195 184 160 109 64 32 41 75 72 72 60 55 56 66 83 89 85 77 74 61 32 22 26 27 36 64 24 26 26 26 24 24 23 25 46 75 86 75 44 24 30 40 75 81 72 46 55 55 54 108 184 202 209 214 219 223 223 225 225 223 220 216 211 205 193 177 161 130 83 40 36 66 76 66 67 55 54 61 
72 78 75 80 80 72 44 23 24 26 34 63 27 27 28 26 24 24 23 25 43 74 89 82 50 32 35 63 80 78 54 48 45 47 65 153 188 175 170 183 198 214 217 220 221 221 222 217 213 209 197 178 165 152 109 46 45 53 81 63 66 59 54 56 53 59 66 72 75 69 42 20 25 26 32 61 28 31 29 25 24 25 25 30 56 90 109 102 73 47 47 71 78 72 43 35 33 42 123 178 169 177 163 137 123 139 176 199 209 211 212 210 206 202 198 192 179 160 111 45 53 55 77 67 63 64 57 50 49 50 57 59 58 49 34 23 26 27 33 61 52 53 35 27 27 27 27 35 64 103 124 115 99 94 83 83 77 63 33 18 22 71 167 189 193 184 172 165 143 126 138 171 197 199 190 174 148 128 119 125 134 143 109 48 61 61 67 66 60 62 56 48 51 50 56 55 50 51 50 39 27 29 34 61 86 84 58 27 25 29 29 37 77 123 129 124 116 118 101 81 73 53 35 18 30 117 188 197 178 168 179 188 172 157 146 167 191 187 167 139 121 118 128 141 142 106 96 55 64 63 62 67 62 60 50 52 49 58 67 69 71 76 71 62 34 29 34 62 94 91 65 32 27 28 28 53 113 146 133 110 101 107 95 80 72 46 46 23 48 149 198 202 169 132 117 107 139 161 156 173 191 182 142 129 135 152 156 135 137 133 94 61 67 65 59 69 59 61 50 51 49 60 73 94 104 99 75 60 34 28 33 61 84 79 50 29 27 27 28 68 141 161 138 79 70 85 91 79 66 52 56 37 64 173 207 208 189 153 127 108 147 147 176 192 211 188 140 148 146 133 130 120 118 119 104 68 66 62 63 68 59 58 52 49 52 56 72 114 135 122 72 39 29 26 32 61 64 58 40 28 27 26 27 73 153 167 133 57 57 74 85 78 68 60 59 49 80 187 205 210 204 181 158 151 154 179 198 203 218 191 160 161 152 131 92 114 89 121 115 68 64 64 62 66 62 57 56 48 47 54 70 122 156 141 73 26 27 29 33 62 64 62 45 30 28 28 28 73 152 166 131 63 72 85 82 77 72 71 63 54 89 188 206 211 215 218 210 203 206 209 204 206 221 184 171 182 161 147 125 117 116 154 135 68 62 62 63 64 65 54 59 48 46 53 65 111 161 151 76 23 28 29 33 62 68 67 48 32 30 29 30 72 149 166 128 70 82 89 78 77 76 78 63 56 92 184 205 211 218 229 230 230 226 211 201 209 224 181 171 199 203 180 166 165 166 172 142 68 59 55 64 59 67 53 55 52 46 53 62 99 157 154 80 25 29 29 33 61 70 
69 51 36 33 29 27 68 143 160 124 71 89 93 82 83 81 78 54 61 97 176 203 212 215 223 225 220 210 202 200 218 228 191 167 198 218 220 213 206 193 179 143 69 49 49 60 61 66 53 55 58 50 48 60 87 148 151 80 25 28 29 33 61 72 67 52 47 42 33 27 65 139 151 116 76 92 92 83 84 82 80 59 62 101 170 197 208 206 206 204 190 187 200 204 219 221 198 168 173 202 218 220 207 192 170 132 61 35 45 58 60 61 50 53 56 52 45 57 87 131 148 78 25 29 30 33 62 82 72 57 56 52 42 28 60 132 146 115 84 94 89 83 85 85 80 63 75 115 163 185 192 190 186 179 174 190 178 167 188 193 169 153 163 164 192 205 200 185 162 116 45 32 42 58 60 56 47 52 55 49 43 57 92 121 141 78 26 27 32 36 63 84 71 65 68 62 53 39 65 130 143 116 90 92 85 82 85 86 81 68 76 79 146 176 169 172 170 166 189 202 172 138 141 139 109 120 162 153 156 181 180 168 145 95 34 35 45 55 66 54 50 52 51 52 44 54 91 107 125 75 38 42 54 59 78 99 84 86 96 94 89 76 87 126 129 114 94 92 84 88 88 86 82 71 86 41 134 176 162 168 163 170 200 208 194 175 143 134 131 142 168 168 138 156 162 148 125 73 39 40 49 62 65 54 53 47 52 50 43 51 75 89 98 68 53 66 82 89 98 108 93 102 119 125 121 107 99 117 119 111 99 90 84 87 87 84 81 76 141 88 117 180 182 180 149 130 162 167 164 161 157 144 137 142 155 160 149 136 141 130 109 55 44 54 54 61 68 57 51 52 53 48 41 48 63 73 71 62 63 79 94 103 109 111 96 112 129 140 136 120 97 97 100 101 99 89 83 82 86 87 80 71 78 47 86 175 188 186 174 141 137 186 195 187 181 174 167 141 107 101 137 150 148 128 97 39 54 68 62 58 68 64 55 55 53 46 42 43 54 60 61 60 65 78 90 99 107 117 105 113 128 137 136 117 88 81 89 94 99 89 83 78 85 87 79 75 57 38 49 148 179 188 192 199 182 198 222 233 230 227 210 182 140 147 181 180 154 126 82 31 60 80 67 65 66 73 60 56 56 48 42 41 47 56 59 62 64 73 81 86 98 114 106 115 120 125 128 109 90 92 103 103 99 89 80 76 84 87 81 77 58 43 39 97 168 186 201 206 200 184 184 194 193 193 175 142 146 185 187 167 138 117 50 36 63 81 70 67 69 78 68 57 55 52 47 43 46 59 62 59 56 60 63 66 80 133 122 112 112 115 118 112 
108 126 146 132 105 90 78 76 84 87 79 75 63 41 41 44 130 182 201 204 203 183 160 155 160 155 141 141 169 188 175 143 124 77 28 45 60 76 72 73 65 77 78 61 56 55 54 48 48 59 65 63 49 39 40 40 57 162 149 124 109 109 126 143 156 169 179 161 116 91 74 72 84 85 78 72 65 46 41 36 62 152 195 210 212 197 180 172 164 158 157 170 182 178 150 126 89 36 39 53 60 66 70 75 64 72 79 65 60 55 52 52 49 59 66 62 44 27 25 25 41 174 166 146 120 112 147 176 189 181 187 163 118 89 72 70 76 79 73 68 61 49 45 37 32 92 159 199 210 214 213 212 204 197 191 192 183 155 123 90 43 36 57 62 57 55 65 67 60 70 78 69 61 59 54 57 51 59 59 53 40 27 25 27 46 160 168 174 164 161 174 179 189 179 173 140 104 81 72 66 64 76 69 60 56 48 47 39 33 75 109 157 189 196 202 207 209 205 200 186 154 108 80 48 31 61 74 66 58 54 59 62 58 71 74 71 65 64 58 56 56 54 50 38 28 25 28 46 66 144 174 201 203 199 191 172 165 160 151 121 92 75 69 62 55 69 63 54 54 44 49 39 33 67 107 113 139 162 168 170 177 180 170 142 98 66 52 42 47 77 79 64 69 65 60 66 52 71 71 74 68 63 64 60 60 52 43 28 24 29 47 74 90 144 183 214 213 203 190 161 153 150 151 112 81 69 64 57 52 60 56 48 46 37 44 39 30 57 113 119 113 118 119 116 117 118 104 81 68 68 65 46 69 82 75 66 84 73 65 64 44 71 72 75 70 61 68 70 62 56 44 31 74 110 117 143 156 136 176 204 199 192 186 169 151 150 150 97 73 65 59 53 51 58 51 41 42 35 30 37 32 45 111 126 121 117 112 103 91 82 81 84 90 89 67 61 82 74 66 76 92 74 70 66 38 69 69 72 70 53 67 76 73 64 52 52 150 189 162 194 201 117 151 174 174 181 184 184 161 153 131 84 65 64 55 50 50 56 47 37 37 32 24 27 29 36 108 134 132 128 125 116 106 97 97 99 101 87 63 79 76 68 67 79 86 70 74 62 39 70 70 70 69 54 67 78 73 76 68 82 186 218 183 203 211 122 141 158 155 159 166 184 170 153 118 83 66 59 48 46 46 51 42 36 37 28 22 19 25 31 103 136 144 142 136 129 119 111 109 112 109 84 78 86 74 63 75 82 78 66 72 51 41 68 68 63 62 48 65 75 85 93 94 113 193 227 213 208 220 152 154 161 159 152 174 184 168 148 115 81 63 52 43 42 47 42 38 34 35 28 19 19 
20 26 94 135 151 152 147 142 134 126 122 124 115 92 86 81 67 65 77 79 69 69 66 36 47 64 63 56 44 40 60 87 115 125 110 115 176 219 218 204 215 156 146 149 165 186 198 187 163 135 107 74 55 45 40 39 42 36 37 33 32 24 16 18 17 36 98 132 153 158 154 149 140 134 131 132 116 97 88 75 67 70 73 73 70 63 48 30 51 56 58 41 30 42 64 110 139 170 170 147 131 182 202 204 208 122 116 112 144 194 194 166 141 115 94 64 50 43 39 38 35 37 33 28 27 21 18 17 21 65 126 133 151 157 154 149 143 138 138 135 121 101 89 68 71 72 68 65 63 51 29 32 49 53 50 31 29 43 79 108 149 165 182 173 152 136 178 211 214 119 119 123 138 179 171 131 105 88 71 55 39 39 40 29 34 32 31 30 26 18 15 21 32 104 153 157 149 162 157 153 149 146 149 148 139 104 87 68 69 68 63 58 44 37 25 32 49 45 31 27 32 48 90 121 137 177 173 181 175 155 139 196 205 182 184 184 182 183 170 141 96 73 62 51 36 36 34 26 30 32 34 28 21 21 23 32 42 121 174 178 162 170 162 164 168 168 164 144 111 85 84 71 71 65 57 41 29 27 23 27 42 31 31 30 42 59 92 132 150 173 172 174 181 171 143 149 173 196 197 193 187 182 171 152 113 94 66 46 39 36 36 37 40 47 53 63 78 56 39 43 46 84 147 169 175 185 173 174 163 136 94 57 60 81 78 71 71 59 44 28 26 30 28 31 35 27 38 44 55 77 92 131 156 174 167 169 181 176 160 129 159 192 191 187 182 179 171 154 127 91 67 51 74 100 114 111 106 102 110 119 105 52 49 49 51 53 57 68 87 96 89 76 59 50 49 53 61 81 73 65 59 42 43 30 32 32 35 35 33 35 46 57 60 79 100 128 165 172 175 166 175 180 165 144 158 184 185 183 183 182 179 155 130 81 99 125 139 150 149 141 149 149 169 172 112 50 49 52 51 54 54 52 52 51 50 50 49 50 54 57 65 79 67 65 51 49 60 42 45 42 40 35 38 41 54 67 71 81 116 123 165 182 179 164 172 182 167 143 149 184 186 186 185 184 176 146 125 150 180 187 191 185 179 177 181 184 198 195 111 52 54 56 54 54 55 54 55 54 53 53 53 55 59 56 67 77 69 60 56 61 71 57 57 48 44 47 47 53 62 77 78 79 124 135 162 184 184 170 166 183 173 148 130 183 184 184 184 181 164 160 182 197 201 205 208 210 208 201 197 202 206 192 70 50 53 56 
57 57 56 59 59 57 55 55 55 56 59 51 67 76 65 58 64 69 69 71 61 56 52 47 55 68 67 79 83 81 116 142 155 180 184 176 161 183 177 157 140 182 183 184 184 179 180 198 203 208 211 213 214 217 219 215 211 214 211 158 55 52 53 57 54 52 52 56 58 57 52 54 54 56 56 57 64 69 61 56 67 65 67 67 65 60 56 58 67 74 74 77 83 82 102 144 151 183 184 181 158 182 181 164 150 182 184 186 185 195 206 209 212 215 216 217 218 223 225 224 222 220 208 107 56 53 54 57 57 53 52 53 56 59 55 56 56 56 54 62 64 64 56 58 68 66 63 71 64 57 54 64 67 76 82 76 84 88 90 139 144 184 186 181 159 181 184 173 160 182 185 186 198 212 214 218 218 219 220 222 223 223 225 224 226 224 198 69 53 54 56 59 59 51 53 54 56 57 58 54 55 54 54 66 64 62 59 62 67 57 67 75 65 55 59 66 71 72 80 81 82 91 85 125 141 186 188 175 164 179 186 177 162" -print(len(first.split()), len(second.split())) - -i = 0 -hasdone = False -for f, s in zip(first.split(), second.split()): - i+=1 - if f != s: - if not hasdone: - print("------------------------------------", i) - hasdone = True - print("- '" + f + "'", "'" + s + "'", end=" | ") diff --git a/misc/explore.py b/misc/explore.py index 1ecdf26..c9d4416 100644 --- a/misc/explore.py +++ b/misc/explore.py @@ -36,13 +36,13 @@ def execute(config): out, err = process.communicate() def mutate(config): - row_strides = [8, 16, 64, 256] - col_strides = [8, 16, 64, 256] + row_strides = [128, 256] + col_strides = [128, 256] rows = [64, 128, 256] cols = [64, 128, 256] clock_rates = [10000000] - array_rows = [256, 350, 500] - array_cols = [8, 16, 32] + array_rows = [128, 256] + array_cols = [90, 100] possibilities = list(itertools.product(row_strides,col_strides,rows, cols, clock_rates, array_rows, array_cols)) print("Combinations:", len(possibilities)) diff --git a/scamp5/analognet2/analog_main.cpp b/scamp5/analognet2/analog_main.cpp index 38fc047..b8cffc8 100644 --- a/scamp5/analognet2/analog_main.cpp +++ b/scamp5/analognet2/analog_main.cpp @@ -23,6 +23,63 @@ #include "conv_instructions.h" #include 
"fc_weights.h" + +SCAMP5 setup() { + int rows = 256; + int cols = 256; + + std::shared_ptr config = std::make_shared(1e7); + std::shared_ptr pixel = std::make_shared(rows, cols, 1, 1, LIVE, "", config); + + std::unordered_map> analogue_registers; + analogue_registers["PIX"] = std::make_shared(rows, cols, config); + analogue_registers["IN"] = std::make_shared(rows, cols, config); + analogue_registers["NEWS"] = std::make_shared(rows, cols, config); + analogue_registers["A"] = std::make_shared(rows, cols, config); + analogue_registers["B"] = std::make_shared(rows, cols, config); + analogue_registers["C"] = std::make_shared(rows, cols, config); + analogue_registers["D"] = std::make_shared(rows, cols, config); + analogue_registers["E"] = std::make_shared(rows, cols, config); + analogue_registers["F"] = std::make_shared(rows, cols, config); + + std::unordered_map> digital_registers; + digital_registers["FLAG"] = std::make_shared(rows, cols, config); + digital_registers["SELECT"] = std::make_shared(rows, cols, config); + digital_registers["RECT"] = std::make_shared(rows, cols, config); + digital_registers["R0"] = std::make_shared(rows, cols, config); + digital_registers["R1"] = std::make_shared(rows, cols, config); + digital_registers["R2"] = std::make_shared(rows, cols, config); + digital_registers["R3"] = std::make_shared(rows, cols, config); + digital_registers["R4"] = std::make_shared(rows, cols, config); + digital_registers["R5"] = std::make_shared(rows, cols, config); + digital_registers["R6"] = std::make_shared(rows, cols, config); + digital_registers["R7"] = std::make_shared(rows, cols, config); + digital_registers["R8"] = std::make_shared(rows, cols, config); + digital_registers["R9"] = std::make_shared(rows, cols, config); + digital_registers["R10"] = std::make_shared(rows, cols, config); + digital_registers["R11"] = std::make_shared(rows, cols, config); + digital_registers["R12"] = std::make_shared(rows, cols, config); + + + std::shared_ptr pe = 
std::make_shared(); + pe->set_rows(rows); + pe->set_cols(cols); + pe->set_config(config); + pe->set_pixel(pixel); + pe->set_analogue_registers(analogue_registers); + pe->set_digital_registers(digital_registers); + + SCAMP5 s = SCAMP5(); + s.set_rows(rows); + s.set_cols(cols); + s.set_origin(Origin::TOP_RIGHT); + s.add_component("pe", pe); + s.init(); + + return s; +} + + // Given SCAN_SIZE coordinates, // group them into 12 bins, filling the count array // central division, with overlaping bins @@ -86,8 +143,11 @@ void update(UI& ui, const std::vector>& reg) { } int analog_main() { - SCAMP5 s = SCAMP5(); + std::cout << "started"<< std::endl; + SCAMP5 s = setup(); + std::cout << "setup" << std::endl; UI ui = UI::get_instance(); + std::cout << "got ui" << std::endl; int threshold_value; int t1_value, t2_value, t3_value; @@ -159,6 +219,7 @@ int analog_main() { int index = 0; // Frame Loop while(1) { + int e1 = cv::getTickCount(); // vs_process_message(); std::fill(std::begin(coordinates), std::end(coordinates), 0); std::fill(std::begin(conv_outputs), std::end(conv_outputs), 0); @@ -276,6 +337,9 @@ int analog_main() { if(fc2_result[i] > fc2_result[max_index]) max_index = i; } + int e2 = cv::getTickCount(); + double frame_ms = ((double)(e2 - e1) / cv::getTickFrequency()) * 1000; + std::cout << frame_ms << " ms" << std::endl; std::cout << "Predicted: " << (int)max_index << std::endl; // if ((int) max_index == 9) { diff --git a/scamp5/scamp5.cpp b/scamp5/scamp5.cpp index 0bc3125..a09a1c8 100644 --- a/scamp5/scamp5.cpp +++ b/scamp5/scamp5.cpp @@ -6,7 +6,7 @@ #include #include - +#include #include #include #include @@ -16,6 +16,14 @@ #include "simulator/external/parser.h" + +using Clock = std::chrono::steady_clock; +using std::chrono::time_point; +using std::chrono::duration_cast; +using std::chrono::milliseconds; +using std::chrono::nanoseconds; +using namespace std::literals::chrono_literals; + void SCAMP5::init() { // Registers used often in instructions pe = 
this->get_component("pe"); @@ -54,6 +62,11 @@ void SCAMP5::init() { intermediate_a = std::make_shared(this->rows_, this->cols_); intermediate_a2 = std::make_shared(this->rows_, this->cols_); intermediate_d = std::make_shared(this->rows_, this->cols_); + + east_ptr_scratch = std::make_shared(this->rows_, this->cols_); + west_ptr_scratch = std::make_shared(this->rows_, this->cols_); + north_ptr_scratch = std::make_shared(this->rows_, this->cols_); + south_ptr_scratch = std::make_shared(this->rows_, this->cols_); } void SCAMP5::nop() { this->update_cycles(1); } @@ -85,6 +98,16 @@ void SCAMP5::get_image(const std::shared_ptr& y, const std::shared_ptrbus(y, h, NEWS, PIX); } +void SCAMP5::downsample(const std::shared_ptr& dst, const std::shared_ptr& src) { + cv::setNumThreads(1); + time_point start = Clock::now(); + cv::resize(src->read(), dst->read(), cv::Size(), 0.5, 0.5, cv::INTER_NEAREST); + time_point end = Clock::now(); + nanoseconds diff = duration_cast(end - start); + std::cout << diff.count() << " ns to downsample" << std::endl; + +} + void SCAMP5::respix() { // reset *PIX this->rpix(); @@ -716,87 +739,256 @@ void SCAMP5::XOR(const std::shared_ptr& Rl, const std::shared_ptr& R void SCAMP5::WHERE(const std::shared_ptr& d) { // FLAG := d. +#ifdef USE_CUDA + if (FLAG->get_mask().empty()) { + this->FLAG->write(d->read()); + } else { + this->FLAG->write(d->read(), FLAG->get_mask()); + } +#else this->FLAG->write(d->read(), FLAG->get_mask()); +#endif this->update_cycles(2); // 1 read, 1 write } void SCAMP5::WHERE(const std::shared_ptr& d0, const std::shared_ptr& d1) { // FLAG := d0 OR d1. 
this->OR(intermediate_d, d0, d1); +#ifdef USE_CUDA + if (FLAG->get_mask().empty()) { + this->FLAG->write(intermediate_d->read()); + } else { + this->FLAG->write(intermediate_d->read(), FLAG->get_mask()); + } +#else this->FLAG->write(intermediate_d->read(), FLAG->get_mask()); +#endif this->update_cycles(1); // 1 write } void SCAMP5::WHERE(const std::shared_ptr& d0, const std::shared_ptr& d1, const std::shared_ptr& d2) { // FLAG := d0 OR d1 OR d2. this->OR(intermediate_d, d0, d1, d2); +#ifdef USE_CUDA + if (FLAG->get_mask().empty()) { + this->FLAG->write(intermediate_d->read()); + } else { + this->FLAG->write(intermediate_d->read(), FLAG->get_mask()); + } +#else this->FLAG->write(intermediate_d->read(), FLAG->get_mask()); +#endif this->update_cycles(1); // 1 write } void SCAMP5::ALL() { // FLAG := 1, same as all. +#ifdef USE_CUDA + if (FLAG->get_mask().empty()) { + this->FLAG->write(1); + } else { + this->FLAG->write(1, FLAG->get_mask()); + } +#else this->FLAG->write(1, FLAG->get_mask()); +#endif this->update_cycles(1); // 1 write } void SCAMP5::SET(const std::shared_ptr& d0) { // d0 := 1 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(1); + } else { + d0->write(1, d0->get_mask()); + } +#else d0->write(1, d0->get_mask()); +#endif this->update_cycles(1); // 1 write } void SCAMP5::SET(const std::shared_ptr& d0, const std::shared_ptr& d1) { // d0, d1 := 1 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(1); + } else { + d0->write(1, d0->get_mask()); + } + + if (d1->get_mask().empty()) { + d1->write(1); + } else { + d1->write(1, d1->get_mask()); + } +#else d0->write(1, d0->get_mask()); d1->write(1, d1->get_mask()); +#endif this->update_cycles(2); // 2 writes } void SCAMP5::SET(const std::shared_ptr& d0, const std::shared_ptr& d1, const std::shared_ptr& d2) { // d0, d1, d2 := 1 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(1); + } else { + d0->write(1, d0->get_mask()); + } + + if (d1->get_mask().empty()) { + d1->write(1); + } else { 
+ d1->write(1, d1->get_mask()); + } + + if (d2->get_mask().empty()) { + d2->write(1); + } else { + d2->write(1, d2->get_mask()); + } +#else d0->write(1, d0->get_mask()); d1->write(1, d1->get_mask()); d2->write(1, d2->get_mask()); +#endif this->update_cycles(3); // 3 writes } void SCAMP5::SET(const std::shared_ptr& d0, const std::shared_ptr& d1, const std::shared_ptr& d2, const std::shared_ptr& d3) { // d0, d1, d2, d3 := 1 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(1); + } else { + d0->write(1, d0->get_mask()); + } + + if (d1->get_mask().empty()) { + d1->write(1); + } else { + d1->write(1, d1->get_mask()); + } + + if (d2->get_mask().empty()) { + d2->write(1); + } else { + d2->write(1, d2->get_mask()); + } + + if (d3->get_mask().empty()) { + d3->write(1); + } else { + d3->write(1, d3->get_mask()); + } +#else d0->write(1, d0->get_mask()); d1->write(1, d1->get_mask()); d2->write(1, d2->get_mask()); d3->write(1, d3->get_mask()); +#endif this->update_cycles(4); // 4 writes } void SCAMP5::CLR(const std::shared_ptr& d0) { // d0 := 0 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(0); + } else { + d0->write(0, d0->get_mask()); + } +#else d0->write(0, d0->get_mask()); +#endif this->update_cycles(1); // 1 write } void SCAMP5::CLR(const std::shared_ptr& d0, const std::shared_ptr& d1) { // d0, d1 := 0 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(0); + } else { + d0->write(0, d0->get_mask()); + } + + if (d1->get_mask().empty()) { + d1->write(0); + } else { + d1->write(0, d1->get_mask()); + } +#else d0->write(0, d0->get_mask()); d1->write(0, d1->get_mask()); +#endif + this->update_cycles(2); // 2 writes } void SCAMP5::CLR(const std::shared_ptr& d0, const std::shared_ptr& d1, const std::shared_ptr& d2) { // d0, d1, d2 := 0 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(0); + } else { + d0->write(0, d0->get_mask()); + } + + if (d1->get_mask().empty()) { + d1->write(0); + } else { + d1->write(0, d1->get_mask()); + } + + if 
(d2->get_mask().empty()) { + d2->write(0); + } else { + d2->write(0, d2->get_mask()); + } +#else d0->write(0, d0->get_mask()); d1->write(0, d1->get_mask()); d2->write(0, d2->get_mask()); +#endif this->update_cycles(3); // 3 writes } void SCAMP5::CLR(const std::shared_ptr& d0, const std::shared_ptr& d1, const std::shared_ptr& d2, const std::shared_ptr& d3) { // d0, d1, d2, d3 := 0 +#ifdef USE_CUDA + if (d0->get_mask().empty()) { + d0->write(0); + } else { + d0->write(0, d0->get_mask()); + } + + if (d1->get_mask().empty()) { + d1->write(0); + } else { + d1->write(0, d1->get_mask()); + } + + if (d2->get_mask().empty()) { + d2->write(0); + } else { + d2->write(0, d2->get_mask()); + } + + if (d3->get_mask().empty()) { + d3->write(0); + } else { + d3->write(0, d3->get_mask()); + } +#else d0->write(0, d0->get_mask()); d1->write(0, d1->get_mask()); d2->write(0, d2->get_mask()); d3->write(0, d3->get_mask()); +#endif this->update_cycles(4); // 4 writes } @@ -826,53 +1018,36 @@ void SCAMP5::REFRESH(const std::shared_ptr& Rl) { void SCAMP5::DNEWS0(const std::shared_ptr& d, const std::shared_ptr& d0) { // d := d0_dir, direction selected by R1, R2, R3, R4 // Reads 0 from the edge - DREG east = DREG(this->rows_, this->cols_); - DREG north = DREG(this->rows_, this->cols_); - DREG west = DREG(this->rows_, this->cols_); - DREG south = DREG(this->rows_, this->cols_); - this->pe->local_read_bus.get_east(east, *d0, 1, 0, this->origin_); - this->pe->local_read_bus.get_north(north, *d0, 1, 0, this->origin_); - this->pe->local_read_bus.get_west(west, *d0, 1, 0, this->origin_); - this->pe->local_read_bus.get_south(south, *d0, 1, 0, this->origin_); + this->pe->local_read_bus.get_east(*east_ptr_scratch, *d0, 1, 0, this->origin_); + this->pe->local_read_bus.get_north(*north_ptr_scratch, *d0, 1, 0, this->origin_); + this->pe->local_read_bus.get_west(*west_ptr_scratch, *d0, 1, 0, this->origin_); + this->pe->local_read_bus.get_south(*south_ptr_scratch, *d0, 1, 0, this->origin_); - 
std::shared_ptr east_ptr = std::make_shared(east); - std::shared_ptr west_ptr = std::make_shared(west); - std::shared_ptr north_ptr = std::make_shared(north); - std::shared_ptr south_ptr = std::make_shared(south); + AND(east_ptr_scratch, east_ptr_scratch, RE); + AND(north_ptr_scratch, north_ptr_scratch, RN); + AND(west_ptr_scratch, west_ptr_scratch, RW); + AND(south_ptr_scratch, south_ptr_scratch, RS); - AND(east_ptr, east_ptr, RE); - AND(north_ptr, north_ptr, RN); - AND(west_ptr, west_ptr, RW); - AND(south_ptr, south_ptr, RS); - - OR(d, east_ptr, north_ptr, south_ptr, west_ptr); + OR(d, east_ptr_scratch, north_ptr_scratch, south_ptr_scratch, west_ptr_scratch); } void SCAMP5::DNEWS1(const std::shared_ptr& d, const std::shared_ptr& d0) { // d := d0_dir, direction selected by R1, R2, R3, R4 // Reads 1 from the edge - DREG east = DREG(this->rows_, this->cols_); - DREG north = DREG(this->rows_, this->cols_); - DREG west = DREG(this->rows_, this->cols_); - DREG south = DREG(this->rows_, this->cols_); - this->pe->local_read_bus.get_east(east, *d0, 1, 1, this->origin_); - this->pe->local_read_bus.get_north(north, *d0, 1, 1, this->origin_); - this->pe->local_read_bus.get_west(west, *d0, 1, 1, this->origin_); - this->pe->local_read_bus.get_south(south, *d0, 1, 1, this->origin_); + this->pe->local_read_bus.get_east(*east_ptr_scratch, *d0, 1, 1, this->origin_); + this->pe->local_read_bus.get_north(*north_ptr_scratch, *d0, 1, 1, this->origin_); + this->pe->local_read_bus.get_west(*west_ptr_scratch, *d0, 1, 1, this->origin_); + this->pe->local_read_bus.get_south(*south_ptr_scratch, *d0, 1, 1, this->origin_); - std::shared_ptr east_ptr = std::make_shared(east); - std::shared_ptr west_ptr = std::make_shared(west); - std::shared_ptr north_ptr = std::make_shared(north); - std::shared_ptr south_ptr = std::make_shared(south); + AND(east_ptr_scratch, east_ptr_scratch, RE); + AND(north_ptr_scratch, north_ptr_scratch, RN); + AND(west_ptr_scratch, west_ptr_scratch, RW); + 
AND(south_ptr_scratch, south_ptr_scratch, RS); - AND(east_ptr, east_ptr, RE); - AND(north_ptr, north_ptr, RN); - AND(west_ptr, west_ptr, RW); - AND(south_ptr, south_ptr, RS); + OR(d, east_ptr_scratch, north_ptr_scratch, south_ptr_scratch, west_ptr_scratch); - OR(d, east_ptr, north_ptr, south_ptr, west_ptr); } void SCAMP5::DNEWS(const std::shared_ptr& Ra, const std::shared_ptr& Rx, int dir, bool boundary) { @@ -1089,7 +1264,13 @@ void SCAMP5::scamp5_diffuse(const std::shared_ptr& target, int iterations, uint8_t SCAMP5::scamp5_read_areg(const std::shared_ptr& areg, uint8_t r, uint8_t c) { // read a single pixel // TODO check that the value is properly mapped to uint8_t from CV_16U - return areg->read().at(r, c); + cv::Mat m; +#ifdef USE_CUDA + areg->read().download(m); +#else + m = areg->read().getMat(cv::ACCESS_READ); +#endif + return m.at(r, c); } uint32_t SCAMP5::scamp5_global_sum_16(const std::shared_ptr& areg, uint8_t *result16v) { @@ -1164,10 +1345,17 @@ uint8_t SCAMP5::scamp5_global_sum_sparse(const std::shared_ptr& areg, uint uint8_t sum = 0; + cv::Mat m; +#ifdef USE_CUDA + areg->read().download(m); +#else + m = areg->read().getMat(cv::ACCESS_READ); +#endif + for(unsigned int row_index = 0; row_index < this->cols_; row_index++) { for(unsigned int col_index = 0; col_index < this->rows_; col_index++) { if(((row_index & r_mask) == r_f) && ((col_index & c_mask) == c_f)) { - sum += areg->read().at(row_index, col_index); + sum += m.at(row_index, col_index); } } } @@ -1192,12 +1380,19 @@ int SCAMP5::scamp5_global_or(const std::shared_ptr& dreg, uint8_t r, uint8 unsigned int r_f = r & r_mask; unsigned int c_f = c & c_mask; + cv::Mat m; +#ifdef USE_CUDA + dreg->read().download(m); +#else + m = dreg->read().getMat(cv::ACCESS_READ); +#endif + uint8_t val = 0; for(unsigned int row_index = 0; row_index < this->cols_; row_index++) { for(unsigned int col_index = 0; col_index < this->rows_; col_index++) { if(((row_index & r_mask) == r_f) && ((col_index & c_mask) == c_f)) { - 
val |= dreg->read().at(row_index, col_index); + val |= m.at(row_index, col_index); } } } @@ -1245,7 +1440,7 @@ void SCAMP5::scamp5_flood(const std::shared_ptr& dreg_target, const std::s cv::copyMakeBorder(dreg_mask->read(), mask, 1, 1, 1, 1, cv::BORDER_REPLICATE); - dreg_mask->read() = 1 - dreg_mask->read(); + cv::subtract(cv::Scalar(1), dreg_mask->read(), dreg_mask->read()); for(auto &seed: seeds) { cv::floodFill(dreg_mask->read(), mask, seed, cv::Scalar(1), nullptr, @@ -1256,7 +1451,15 @@ void SCAMP5::scamp5_flood(const std::shared_ptr& dreg_target, const std::s void SCAMP5::scamp5_load_point(const std::shared_ptr& dr, uint8_t r, uint8_t c) { // set a single pixel on a DREG image to 1, the rest to 0 dr->read().setTo(0); - dr->read().at(r, c) = 1; +#ifdef USE_CUDA + cv::Mat m; + dr->read().download(m); + m.at(r, c) = 1; + dr->read().upload(m); +#else + dr->read().getMat(cv::ACCESS_RW).at(r, c) = 1; +#endif + } void SCAMP5::scamp5_load_rect(const std::shared_ptr& dr, uint8_t r0, uint8_t c0, uint8_t r1, @@ -1297,13 +1500,24 @@ void SCAMP5::scamp5_load_pattern(const std::shared_ptr& dr, uint8_t r, uin unsigned int r_f = r * r_mask; unsigned int c_f = c * c_mask; - for(unsigned int row_index = 0; row_index < this->cols_; row_index++) { - for(unsigned int col_index = 0; col_index < this->rows_; col_index++) { + cv::Mat m; +#ifdef USE_CUDA + dr->read().download(m); +#else + m = dr->read().getMat(cv::ACCESS_WRITE); +#endif + + for(int row_index = 0; row_index < this->cols_; row_index++) { + for(int col_index = 0; col_index < this->rows_; col_index++) { if(((row_index * r_mask) == r_f) && ((col_index * c_mask) == c_f)) { - dr->read().at(row_index, col_index) = 1; + m.at(row_index, col_index) = 1; } } } + +#ifdef USE_CUDA + dr->read().upload(m); +#endif } void SCAMP5::scamp5_select_point(uint8_t r, uint8_t c) { @@ -1328,24 +1542,42 @@ void SCAMP5::scamp5_select_pattern(uint8_t r, uint8_t c, uint8_t rx, void SCAMP5::scamp5_select_col(uint8_t c) { // select column + cv::Mat m; +#ifdef 
USE_CUDA + SELECT->read().download(m); +#else + m = SELECT->read().getMat(cv::ACCESS_WRITE); +#endif for(unsigned int row_index = 0; row_index < this->cols_; row_index++) { for(unsigned int col_index = 0; col_index < this->rows_; col_index++) { if(col_index == c) { - SELECT->read().at(row_index, col_index) = 1; + m.at(row_index, col_index) = 1; } } } +#ifdef USE_CUDA + SELECT->read().upload(m); +#endif } void SCAMP5::scamp5_select_row(uint8_t r) { // select row + cv::Mat m; +#ifdef USE_CUDA + SELECT->read().download(m); +#else + m = SELECT->read().getMat(cv::ACCESS_WRITE); +#endif for(unsigned int row_index = 0; row_index < this->cols_; row_index++) { for(unsigned int col_index = 0; col_index < this->rows_; col_index++) { if(row_index == r) { - SELECT->read().at(row_index, col_index) = 1; + m.at(row_index, col_index) = 1; } } } +#ifdef USE_CUDA + SELECT->read().upload(m); +#endif } void SCAMP5::scamp5_select_colx(uint8_t cx) { @@ -1368,7 +1600,17 @@ void SCAMP5::scamp5_draw_end() { void SCAMP5::scamp5_draw_pixel(uint8_t r, uint8_t c) { // draw a point, wrap around if it's outside the border - scratch->read().at(r % this->rows_, c % this->cols_) = 1; + cv::Mat m; +#ifdef USE_CUDA + scratch->read().download(m); +#else + m = scratch->read().getMat(cv::ACCESS_WRITE); +#endif + m.at(r % this->rows_, c % this->cols_) = 1; + +#ifdef USE_CUDA + scratch->read().upload(m); +#endif } bool SCAMP5::scamp5_draw_point(int r, int c) { @@ -1377,7 +1619,16 @@ bool SCAMP5::scamp5_draw_point(int r, int c) { if(r >= this->rows_ || c >= this->cols_) { return false; } - scratch->read().at(r, c) = 1; + cv::Mat m; +#ifdef USE_CUDA + scratch->read().download(m); +#else + m = scratch->read().getMat(cv::ACCESS_WRITE); +#endif + m.at(r, c) = 1; +#ifdef USE_CUDA + scratch->read().upload(m); +#endif return true; } @@ -1421,10 +1672,17 @@ void SCAMP5::scamp5_draw_circle(int x0, int y0, int radius, bool repeat) { int x = 0; int y = radius; - scratch->read().at(y0 + radius, x0) = 1; - 
scratch->read().at(y0 - radius, x0) = 1; - scratch->read().at(y0, x0 + radius) = 1; - scratch->read().at(y0, x0 - radius) = 1; + cv::Mat m; +#ifdef USE_CUDA + scratch->read().download(m); +#else + m = scratch->read().getMat(cv::ACCESS_WRITE); +#endif + + m.at(y0 + radius, x0) = 1; + m.at(y0 - radius, x0) = 1; + m.at(y0, x0 + radius) = 1; + m.at(y0, x0 - radius) = 1; while(x < y) { if(f >= 0) { @@ -1437,21 +1695,24 @@ void SCAMP5::scamp5_draw_circle(int x0, int y0, int radius, bool repeat) { ddf_x += 2; f += ddf_x; - scratch->read().at(y0 + y, x0 + x) = 1; - scratch->read().at(y0 + y, x0 - x) = 1; - scratch->read().at(y0 - y, x0 + x) = 1; - scratch->read().at(y0 - y, x0 - x) = 1; - scratch->read().at(y0 + x, x0 + y) = 1; - scratch->read().at(y0 + x, x0 - y) = 1; - scratch->read().at(y0 - x, x0 + y) = 1; - scratch->read().at(y0 - x, x0 - y) = 1; + m.at(y0 + y, x0 + x) = 1; + m.at(y0 + y, x0 - x) = 1; + m.at(y0 - y, x0 + x) = 1; + m.at(y0 - y, x0 - x) = 1; + m.at(y0 + x, x0 + y) = 1; + m.at(y0 + x, x0 - y) = 1; + m.at(y0 - x, x0 + y) = 1; + m.at(y0 - x, x0 - y) = 1; } +#ifdef USE_CUDA + scratch->read().upload(m); +#endif } void SCAMP5::scamp5_draw_negate() { // do a binary inversion of the DREG image. 
// TODO abstraction - scratch->read() = 1 - scratch->read(); + cv::subtract(cv::Scalar(1), scratch->read(), scratch->read()); } // Image Readout @@ -1475,11 +1736,21 @@ void SCAMP5::scamp5_scan_areg_8x8(const std::shared_ptr& areg, uint8_t *re int buf_index = 0; int cs = this->cols_ / 8; int rs = this->rows_ / 8; + + cv::Mat m; +#ifdef USE_CUDA + areg->read().download(m); +#else + m = areg->read().getMat(cv::ACCESS_READ); +#endif for(int col = 0; col < this->cols_; col += cs) { for(int row = 0; row < this->rows_; row += rs) { - result8x8[buf_index++] = areg->read().at(row, col); + result8x8[buf_index++] = m.at(row, col); } } +#ifdef USE_CUDA + areg->read().upload(m); +#endif } void SCAMP5::scamp5_scan_areg_mean_8x8(const std::shared_ptr& areg, uint8_t *result8x8) { @@ -1496,20 +1767,27 @@ void SCAMP5::scamp5_scan_dreg(const std::shared_ptr& dreg, uint8_t *mem, u // r1 - last row index // The size of the buffer need to be a least 32 times the number of rows to // scan. Thus, a full DREG image requires a buffer of 8192 bytes. 
- // TODO check if it should be (row, col) or (col, row) // TODO check impl + + cv::Mat m; +#ifdef USE_CUDA + dreg->read().download(m); +#else + m = dreg->read().getMat(cv::ACCESS_READ); +#endif + int buf_index = 0; for(uint32_t row_index = r0; row_index <= r1; row_index++) { // Read 8 values at a time to make up a byte for(int col_index = 0; col_index < this->cols_; col_index += 8) { - uint8_t b0 = dreg->read().at(row_index, col_index); - uint8_t b1 = dreg->read().at(row_index, col_index + 1); - uint8_t b2 = dreg->read().at(row_index, col_index + 2); - uint8_t b3 = dreg->read().at(row_index, col_index + 3); - uint8_t b4 = dreg->read().at(row_index, col_index + 4); - uint8_t b5 = dreg->read().at(row_index, col_index + 5); - uint8_t b6 = dreg->read().at(row_index, col_index + 6); - uint8_t b7 = dreg->read().at(row_index, col_index + 7); + uint8_t b0 = m.at(row_index, col_index); + uint8_t b1 = m.at(row_index, col_index + 1); + uint8_t b2 = m.at(row_index, col_index + 2); + uint8_t b3 = m.at(row_index, col_index + 3); + uint8_t b4 = m.at(row_index, col_index + 4); + uint8_t b5 = m.at(row_index, col_index + 5); + uint8_t b6 = m.at(row_index, col_index + 6); + uint8_t b7 = m.at(row_index, col_index + 7); uint8_t value = (b0 << 7) | (b1 << 6) | (b2 << 5) | (b3 << 4) | (b4 << 3) | (b5 << 2) | (b6 << 1) | (b7 << 0); mem[buf_index++] = value; @@ -1544,11 +1822,17 @@ void SCAMP5::scamp5_scan_events(const std::shared_ptr& dreg, uint8_t *mem, void SCAMP5::scamp5_scan_events(const std::shared_ptr& dreg, uint8_t *buffer, uint16_t max_num, uint8_t r0, uint8_t c0, uint8_t r1, uint8_t c1, uint8_t rs, uint8_t cs) { + cv::Mat m; +#ifdef USE_CUDA + dreg->read().download(m); +#else + m = dreg->read().getMat(cv::ACCESS_WRITE); +#endif // assuming 0,0 in top left int buf_index = 0; for(int col = c0; col < c1; col += cs) { for(int row = r0; row < r1; row += rs) { - if(dreg->read().at(row, col) > 0) { + if(m.at(row, col) > 0) { if(buf_index == 2 * max_num) return; buffer[buf_index++] = 
col; @@ -1708,7 +1992,7 @@ RTTR_REGISTRATION { .method("NOR", select_overload&, const std::shared_ptr&, const std::shared_ptr&)>(&SCAMP5::NOR)) .method("NOR", select_overload&, const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&)>(&SCAMP5::NOR)) .method("NOR", select_overload&, const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&, const std::shared_ptr&)>(&SCAMP5::NOR)) - .method("NOR", select_overload&)>(&SCAMP5::NOT)) + .method("NOT", select_overload&)>(&SCAMP5::NOT)) .method("OR", select_overload&, const std::shared_ptr&)>(&SCAMP5::OR)) .method("NOR", select_overload&, const std::shared_ptr&)>(&SCAMP5::NOR)) .method("AND", &SCAMP5::AND) @@ -1787,5 +2071,6 @@ RTTR_REGISTRATION { .method("scamp5_scan_events", select_overload&, uint8_t*, uint16_t, uint8_t, uint8_t)>(&SCAMP5::scamp5_scan_events))(default_arguments((uint16_t)1000, (uint8_t)0, (uint8_t)0)) .method("scamp5_scan_events", select_overload&, uint8_t*, uint16_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t, uint8_t)>(&SCAMP5::scamp5_scan_events)) .method("scamp5_scan_boundingbox", &SCAMP5::scamp5_scan_boundingbox) - .method("print_stats", &SCAMP5::print_stats)(default_arguments(std::string())); + .method("print_stats", &SCAMP5::print_stats)(default_arguments(std::string())) + .method("downsample", &SCAMP5::downsample); } diff --git a/scamp5/scamp5.h b/scamp5/scamp5.h index 776637a..935e72d 100644 --- a/scamp5/scamp5.h +++ b/scamp5/scamp5.h @@ -43,6 +43,11 @@ class SCAMP5 : public Architecture { std::shared_ptr intermediate_d; std::shared_ptr scratch = nullptr; + std::shared_ptr east_ptr_scratch; + std::shared_ptr west_ptr_scratch; + std::shared_ptr north_ptr_scratch; + std::shared_ptr south_ptr_scratch; + SCAMP5() = default; void init(); rttr::variant config_converter(json& j); @@ -99,6 +104,7 @@ class SCAMP5 : public Architecture { void get_image(const std::shared_ptr& y); void get_image(const std::shared_ptr& y, const std::shared_ptr& h); + void downsample(const 
std::shared_ptr& dst, const std::shared_ptr& src); void respix(); diff --git a/scamp5_extended/scamp5_e.cpp b/scamp5_extended/scamp5_e.cpp index 4012acb..ecf2174 100644 --- a/scamp5_extended/scamp5_e.cpp +++ b/scamp5_extended/scamp5_e.cpp @@ -65,6 +65,22 @@ void SCAMP5E::superpixel_shift_patterns_from_bitorder(int bank, const std::share DigitalRegister R_EAST(rows, cols); DigitalRegister R_WEST(rows, cols); + cv::Mat rs; + cv::Mat rn; + cv::Mat re; + cv::Mat rw; +#ifdef USE_CUDA + R_SOUTH.read().download(rs); + R_NORTH.read().download(rn); + R_EAST.read().download(re); + R_WEST.read().download(rw); +#else + rs = R_SOUTH.read().getMat(cv::ACCESS_WRITE); + rn = R_NORTH.read().getMat(cv::ACCESS_WRITE); + re = R_EAST.read().getMat(cv::ACCESS_WRITE); + rw = R_WEST.read().getMat(cv::ACCESS_WRITE); +#endif + for (size_t row = 0; row < rows; row++) { for (size_t col = 0; col < cols; col++) { int north; @@ -88,37 +104,44 @@ void SCAMP5E::superpixel_shift_patterns_from_bitorder(int bank, const std::share if (current == north + 1) { // bigger than north if (shift_left) { - R_SOUTH.read().at(row - 0, col) = 1; + rs.at(row - 0, col) = 1; } else { - R_NORTH.read().at(row - 1, col) = 1; + rn.at(row - 1, col) = 1; } } else if (current == north - 1) { // smaller than north if (shift_left) { - R_NORTH.read().at(row - 1, col) = 1; + rn.at(row - 1, col) = 1; } else { - R_SOUTH.read().at(row, col) = 1; + rs.at(row, col) = 1; } } if (current == west + 1) { // bigger than west if (shift_left) { - R_EAST.read().at(row, col) = 1; + re.at(row, col) = 1; } else { - R_WEST.read().at(row, col - 1) = 1; + rw.at(row, col - 1) = 1; } } else if (current == west - 1) { // smaller than west if (shift_left) { - R_WEST.read().at(row, col - 1) = 1; + rw.at(row, col - 1) = 1; } else { - R_EAST.read().at(row, col) = 1; + re.at(row, col) = 1; } } } } +#ifdef USE_CUDA + R_NORTH.read().upload(rn); + R_SOUTH.read().upload(rs); + R_EAST.read().upload(re); + R_WEST.read().upload(rw); +#endif + switch (origin_) 
{ case BOTTOM_LEFT: { RSOUTH->write(R_NORTH); @@ -185,8 +208,16 @@ void SCAMP5E::superpixel_adc(const std::shared_ptr& dst, int bank, const s position_map locations; this->superpixel_positions_from_bitorder(locations); - cv::Mat& d = dst->read(); - cv::Mat& sr = src->read(); + cv::Mat d; + cv::Mat sr; +#ifdef USE_CUDA + dst->read().download(d); + src->read().download(sr); +#else + d = dst->read().getMat(cv::ACCESS_WRITE); + sr = src->read().getMat(cv::ACCESS_RW); +#endif + parallel_for_(cv::Range(0, sr.rows * sr.cols), [&](const cv::Range& range) { for (int r = range.start; r < range.end; r++) { int row = r / sr.cols; @@ -205,6 +236,10 @@ void SCAMP5E::superpixel_adc(const std::shared_ptr& dst, int bank, const s } } }); +#ifdef USE_CUDA + dst->read().upload(d); + src->read().upload(sr); +#endif } void SCAMP5E::superpixel_dac(const std::shared_ptr& dst, int bank, const std::shared_ptr& src) { @@ -212,12 +247,19 @@ void SCAMP5E::superpixel_dac(const std::shared_ptr& dst, int bank, const s this->superpixel_positions_from_bitorder(locations); // Converts digital superpixel format image to an analogue image - cv::Mat& d = dst->read(); - cv::Mat& s = src->read(); - parallel_for_(cv::Range(0, s.rows * s.cols), [&](const cv::Range& range) { + cv::Mat d; + cv::Mat sr; +#ifdef USE_CUDA + dst->read().download(d); + src->read().download(sr); +#else + d = dst->read().getMat(cv::ACCESS_WRITE); + sr = src->read().getMat(cv::ACCESS_RW); +#endif + parallel_for_(cv::Range(0, sr.rows * sr.cols), [&](const cv::Range& range) { for (int r = range.start; r < range.end; r++) { - int row = r / s.cols; - int col = r % s.cols; + int row = r / sr.cols; + int col = r % sr.cols; if (row % superpixel_size_ != 0) continue; // Step size is superpixel_size_ if (col % superpixel_size_ != 0) continue; @@ -226,12 +268,16 @@ void SCAMP5E::superpixel_dac(const std::shared_ptr& dst, int bank, const s int8_t value = 0; for (int i = 0; i < bits_in_bank_; i++) { cv::Point relative_pos = locations.at({bank, 
i + 1}); // bitorder starts at 1 not 0 - int bit = s.at(relative_pos.y + row, relative_pos.x + col); + int bit = sr.at(relative_pos.y + row, relative_pos.x + col); value |= bit << i; // LSB to MSB } d(cv::Rect(col, row, superpixel_size_, superpixel_size_)) = value; } }); +#ifdef USE_CUDA + dst->read().upload(d); + src->read().upload(sr); +#endif } void SCAMP5E::superpixel_in(const std::shared_ptr& dst, int bank, int value) { @@ -245,7 +291,14 @@ void SCAMP5E::superpixel_in(const std::shared_ptr& dst, int bank, int valu bits[i] = bit; } - cv::Mat& d = dst->read(); + cv::Mat d; + +#ifdef USE_CUDA + dst->read().download(d); +#else + d = dst->read().getMat(cv::ACCESS_WRITE); +#endif + for (int col = 0; col < d.cols; col += superpixel_size_) { for (int row = 0; row < d.rows; row += superpixel_size_) { for (int i = 0; i < bits_in_bank_; i++) { @@ -264,22 +317,45 @@ void SCAMP5E::superpixel_shift(const std::shared_ptr& dst, int bank, const std::shared_ptr RSOUTH = std::make_shared(superpixel_size_, superpixel_size_); std::shared_ptr REAST = std::make_shared(superpixel_size_, superpixel_size_); std::shared_ptr RWEST = std::make_shared(superpixel_size_, superpixel_size_); + + cv::Mat rs; + cv::Mat rn; + cv::Mat re; + cv::Mat rw; +#ifdef USE_CUDA + RSOUTH->read().download(rs); + RNORTH->read().download(rn); + REAST->read().download(re); + RWEST->read().download(rw); +#else + rs = RSOUTH->read().getMat(cv::ACCESS_READ); + rn = RNORTH->read().getMat(cv::ACCESS_READ); + re = REAST->read().getMat(cv::ACCESS_READ); + rw = RWEST->read().getMat(cv::ACCESS_READ); +#endif + + superpixel_shift_patterns_from_bitorder(bank, RNORTH, RSOUTH, REAST, RWEST,shift_left); // TODO non-square superpixels? 
int num_of_repeats_y = rows / superpixel_size_; int num_of_repeats_x = cols / superpixel_size_; - std::shared_ptr R_NORTH = std::make_shared(cv::repeat(RNORTH->read(), num_of_repeats_y, num_of_repeats_x)); - std::shared_ptr R_SOUTH = std::make_shared(cv::repeat(RSOUTH->read(), num_of_repeats_y, num_of_repeats_x)); - std::shared_ptr R_EAST = std::make_shared(cv::repeat(REAST->read(), num_of_repeats_y, num_of_repeats_x)); - std::shared_ptr R_WEST = std::make_shared(cv::repeat(RWEST->read(), num_of_repeats_y, num_of_repeats_x)); + std::shared_ptr R_NORTH = std::make_shared(cv::repeat(rn, num_of_repeats_y, num_of_repeats_x)); + std::shared_ptr R_SOUTH = std::make_shared(cv::repeat(rs, num_of_repeats_y, num_of_repeats_x)); + std::shared_ptr R_EAST = std::make_shared(cv::repeat(re, num_of_repeats_y, num_of_repeats_x)); + std::shared_ptr R_WEST = std::make_shared(cv::repeat(rw, num_of_repeats_y, num_of_repeats_x)); superpixel_shift_block(dst, src, R_NORTH, R_SOUTH, R_EAST, R_WEST); if (bitorder_.size() > 1) { // only need to preserve other banks if we have more than 1 bank DigitalRegister block_mask = DigitalRegister(superpixel_size_, superpixel_size_); - cv::Mat& bm = block_mask.read(); + cv::Mat bm; +#ifdef USE_CUDA + block_mask.read().download(bm); +#else + bm = block_mask.read().getMat(cv::ACCESS_RW); +#endif for (size_t b = 0; b < bitorder_.size(); b++) { for (size_t row = 0; row < superpixel_size_; row++) { for (size_t col = 0; col < superpixel_size_; col++) { @@ -293,12 +369,19 @@ void SCAMP5E::superpixel_shift(const std::shared_ptr& dst, int bank, const } } +#ifdef USE_CUDA + block_mask.read().upload(bm); +#endif + std::shared_ptr mask = std::make_shared(cv::repeat(bm, num_of_repeats_y, num_of_repeats_x)); std::shared_ptr and_ = std::make_shared(rows, cols); AND(and_, mask, src); OR(dst, and_, dst); + + } + } void SCAMP5E::superpixel_shift_right(const std::shared_ptr& dst, int bank, const std::shared_ptr& src) { @@ -462,7 +545,13 @@ void SCAMP5E::histogram(const 
std::shared_ptr& src) { this->dram_.reset(); int blocksize = 8; int block = 0; - cv::Mat& s = src->read(); + + cv::Mat s; +#ifdef USE_CUDA + src->read().download(s); +#else + s = src->read().getMat(cv::ACCESS_READ); +#endif for (int row = 0; row < rows_; row += blocksize) { for (int col = 0; col < cols_; col += blocksize) { diff --git a/scamp5_multiplexed/jpeg.py b/scamp5_multiplexed/jpeg.py deleted file mode 100644 index 4db54d0..0000000 --- a/scamp5_multiplexed/jpeg.py +++ /dev/null @@ -1,147 +0,0 @@ -import numpy as np -from PIL import Image - -def save(image, name): - Image.fromarray(image).save(name + ".png") - - -def proper(image): - image -= 128 - print(np.amax(image), np.amin(image)) - shape = image.shape - print(shape) - - T_list = [[.3536, .3536, .3536, .3536, .3536, .3536, .3536, .3536], - [.4904, .4157, .2778, .0975, -.0975, -.2778, -.4157, -.4904], - [.4619, .1913, -.1913, -.4619, -.4619, -.1913, .1913, .4619], - [.4157, -.0975, -.4904, -.2778, .2778, .4904, .0975, -.4157], - [.3536, -.3536, -.3536, .3536, .3536, -.3536, -.3536, .3536], - [.2778, -.4904, .0975, .4157, -.4157, -.0975, .4904, -.2778], - [.1913, -.4619, .4619, -.1913, -.1913, .4619, -.4619, .1913], - [.0975, -.2778, .4157, -.4904, .4904, -.4157, .2778, -.0975]] - - - T = np.array(T_list) - T_trans = T.T - - Q50_list = [[16, 11, 10, 16, 24, 40, 51, 61], - [12, 12, 14, 19, 26, 58, 60, 55], - [14, 13, 16, 24, 40, 57, 69, 56], - [14, 17, 22, 29, 51, 87, 80, 62], - [18, 22, 37, 56, 68, 109, 103, 77], - [24, 35, 55, 64, 81, 104, 113, 92], - [49, 64, 78, 87, 103, 121, 120, 101], - [72, 92, 95, 98, 112, 100, 103, 99]] - Q50 = np.array(Q50_list) - Q50 = np.tile(Q50, (shape[0] // 8, shape[1] // 8)) - - D = np.zeros(shape) - for row in range(0, shape[0], 8): - for col in range(0, shape[1], 8): - D[row:row + 8, col:col + 8] = T @ image[row:row + 8, col:col + 8] @ T_trans - - - #for row in range(0, shape[0], 8): - #for col in range(0, shape[1], 8): - #D[row:row + 8, col:col + 8] = D[row:row + 8, 
col:col + 8] @ T_trans - - C = np.round(D / Q50) - C = C.astype(int) - - - #count = 0 - #for row in range(0, shape[0], 1): - #for col in range(0, shape[1], 1): - #if C[row][col] != 0: - #count+=1 - #print(count/(shape[0]*shape[1]) * 100, "% coefficents retained with proper") - return C, Q50, T - -def approx(image): - image -= 128 - shape = image.shape - - T_list = [[.3536, .3536, .3536, .3536, .3536, .3536, .3536, .3536], - [.4904, .4157, .2778, .0975, -.0975, -.2778, -.4157, -.4904], - [.4619, .1913, -.1913, -.4619, -.4619, -.1913, .1913, .4619], - [.4157, -.0975, -.4904, -.2778, .2778, .4904, .0975, -.4157], - [.3536, -.3536, -.3536, .3536, .3536, -.3536, -.3536, .3536], - [.2778, -.4904, .0975, .4157, -.4157, -.0975, .4904, -.2778], - [.1913, -.4619, .4619, -.1913, -.1913, .4619, -.4619, .1913], - [.0975, -.2778, .4157, -.4904, .4904, -.4157, .2778, -.0975]] - - sf = 100 - T = np.array(T_list) - T = T * sf - T = np.rint(T).astype(int) - - T_trans = T.T - - Q50_list = [[16, 11, 10, 16, 24, 40, 51, 61], - [12, 12, 14, 19, 26, 58, 60, 55], - [14, 13, 16, 24, 40, 57, 69, 56], - [14, 17, 22, 29, 51, 87, 80, 62], - [18, 22, 37, 56, 68, 109, 103, 77], - [24, 35, 55, 64, 81, 104, 113, 92], - [49, 64, 78, 87, 103, 121, 120, 101], - [72, 92, 95, 98, 112, 100, 103, 99]] - Q50 = np.array(Q50_list) - Q50 = np.tile(Q50, (shape[0] // 8, shape[1] // 8)) - - D = np.zeros(shape) - for row in range(0, shape[0], 8): - for col in range(0, shape[1], 8): - D[row:row + 8, col:col + 8] = T @ image[row:row + 8, col:col + 8] - - for row in range(0, shape[0], 8): - for col in range(0, shape[1], 8): - D[row:row + 8, col:col + 8] = D[row:row + 8, col:col + 8] @ T_trans - - D = D / (sf * sf) - D = np.rint(D).astype(int) - - C = np.round(D / Q50) - C = C.astype(int) - - #count = 0 - #for row in range(0, shape[0], 1): - #for col in range(0, shape[1], 1): - #if C[row][col] != 0: - #count+=1 - #print(count/(shape[0]*shape[1]) * 100, "% coefficents retained with approx") - return C, Q50, T - - - 
-def decompress(C, Q50, T, shape, div=False): - R = Q50 * C - T_trans = T.T - decompress = np.zeros(shape) - for row in range(0, shape[0], 8): - for col in range(0, shape[1], 8): - decompress[row:row + 8, col:col + 8] = np.round(T_trans @ R[row:row + 8, col:col + 8] @ T) - if div: - decompress = np.rint(decompress / (100 * 100)) - - print(np.amax(decompress), np.amin(decompress)) - decompress = (decompress + 128).astype(np.uint8) - decompress[decompress>255] = 255 - decompress[decompress<0] = 0 - print(np.amax(decompress), np.amin(decompress)) - return decompress - - -imname = "icecream" -image = np.array(Image.open('/home/jm1417/Downloads/' + imname + '.jpg').convert('L')) -C, Q50, T = proper(image) -proper_dec = decompress(C, Q50, T, image.shape) -image = np.array(Image.open('/home/jm1417/Downloads/' + imname + '.jpg').convert('L')) -C, Q50, T = approx(image) -approx_dec = decompress(C, Q50, T, image.shape, True) -save(proper_dec, imname + "_proper.jpg") -save(approx_dec, imname + "_approx.jpg") - - - - - diff --git a/scamp5_multiplexed/scamp5m.cpp b/scamp5_multiplexed/scamp5m.cpp index 527f2fc..bc5bb5b 100644 --- a/scamp5_multiplexed/scamp5m.cpp +++ b/scamp5_multiplexed/scamp5m.cpp @@ -30,7 +30,7 @@ void SCAMP5M::init() { // array for that row and col this->SET(FLAG); - classifier = read_viola_classifier("/home/jm1417/Simulator/scamp5_multiplexed/class.txt"); + // classifier = read_viola_classifier("/home/jm1417/Simulator/scamp5_multiplexed/class.txt"); } void SCAMP5M::nop() { this->update_cycles(1); } @@ -43,7 +43,12 @@ void SCAMP5M::rpix() { void SCAMP5M::get_image(AREG y) { // y := half-range image, and reset pixel - cv::Mat image = this->pe->get_pixel()->read(); + cv::Mat image; +#ifdef USE_CUDA + this->pe->get_pixel()->read().download(image); +#else + image = this->pe->get_pixel()->read().getMat(cv::ACCESS_READ); +#endif int patch = 0; for (int row = 0; row < rows_; row += row_stride_) { @@ -72,7 +77,12 @@ void SCAMP5M::get_image(AREG y) { void 
SCAMP5M::get_image(AREG y, AREG h) { // y := full-range image, h := negative half-range image, and reset *PIX - cv::Mat image = this->pe->get_pixel()->read(); + cv::Mat image; +#ifdef USE_CUDA + this->pe->get_pixel()->read().download(image); +#else + image = this->pe->get_pixel()->read().getMat(cv::ACCESS_READ); +#endif int patch = 0; for (int row = 0; row < rows_; row += row_stride_) { @@ -562,54 +572,46 @@ void SCAMP5M::movx(AREG y, AREG x0, news_t dir) { PlaneParams p; get_dir_params(p, dir, 0, 0, this->rows_, this->cols_, 1, 1); - int patch = 0; - for (int row = p.row_start; p.row_op(row, p.row_end); row += p.row_step) { - for (int col = p.col_start; p.col_op(col, p.col_end); col += p.col_step) { - int elem = 0; - for (int r = row; r < row + row_stride_; r++) { - for (int c = col; c < col + col_stride_; c++) { - int x0_val = 0; - switch (dir) { - case east: { - int dram_row_select = col - 1; // TOP_RIGHT origin means it's -1 - if (dram_row_select >= 0) { - x0_val = this->dram->read_signed_byte(row, dram_row_select, x0); - } - break; - } - case west: { - int dram_row_select = col + 1; // TOP_RIGHT origin means it's +1 - if (dram_row_select < this->cols_) { - x0_val = this->dram->read_signed_byte(row, dram_row_select, x0); - } - break; - }; - case north: { - int dram_array_select = row - 1; // TOP_RIGHT origin means it's -1 - if (dram_array_select < this->rows_) { - x0_val = this->dram->read_signed_byte(dram_array_select, col, x0); - } - break; - }; - case south: { - int dram_array_select = row + 1; // TOP_RIGHT origin means it's +1 - if (dram_array_select >= 0) { - x0_val = this->dram->read_signed_byte(dram_array_select, col, x0); - } - break; - }; - case alldir: { - std::cerr << "Unhandled direction" << std::endl; - break; - }; + for (int col = p.col_start; p.col_op(col, p.col_end); col += p.col_step) { + for (int row = p.row_start; p.row_op(row, p.row_end); row += p.row_step) { + int x0_val = 0; + switch (dir) { + case east: { + int dram_row_select = col - 1; 
// TOP_RIGHT origin means it's -1 + if (dram_row_select >= 0) { + x0_val = this->dram->read_signed_byte(row, dram_row_select, x0); } - int neg = this->alu->execute(0, x0_val, ALU::SUB); - this->dram->write_signed_byte(row, col, y, x0_val); - this->dram->write_signed_byte(row, col, NEWS, neg); - elem++; + break; } + case west: { + int dram_row_select = col + 1; // TOP_RIGHT origin means it's +1 + if (dram_row_select < this->cols_) { + x0_val = this->dram->read_signed_byte(row, dram_row_select, x0); + } + break; + }; + case north: { + int dram_array_select = row - 1; // TOP_RIGHT origin means it's -1 + if (dram_array_select < this->rows_) { + x0_val = this->dram->read_signed_byte(dram_array_select, col, x0); + } + break; + }; + case south: { + int dram_array_select = row + 1; // TOP_RIGHT origin means it's +1 + if (dram_array_select >= 0) { + x0_val = this->dram->read_signed_byte(dram_array_select, col, x0); + } + break; + }; + case alldir: { + std::cerr << "Unhandled direction" << std::endl; + break; + }; } - patch++; + int neg = this->alu->execute(0, x0_val, ALU::SUB); + this->dram->write_signed_byte(row, col, y, x0_val); + this->dram->write_signed_byte(row, col, NEWS, neg); } } @@ -2253,7 +2255,7 @@ void SCAMP5M::display() { } std::shared_ptr SCAMP5M::read_viola_classifier(const std::string &classifier_path) { - int stages = 25; // number of stages + int stages = 25; // number of stages /*total number of weak classifiers (one node each)*/ int total_nodes = 2913; int i, j, k, l; @@ -2266,20 +2268,19 @@ std::shared_ptr SCAMP5M::read_viola_classifier(const std::string & std::vector stages_array {9, 16, 27, 32, 52, 53, 62, 72, 83, 91, 99, 115, 127, 135, 136, 137, 159, 155, 169, 196, 197, 181, 199, 211, 200}; - /* TODO: use matrices where appropriate */ /*********************************************** * Allocate a lot of array structures * Note that, to increase parallelism, * some arrays need to be splitted or duplicated **********************************************/ 
- std::shared_ptr > rectangles_array = std::make_shared >(total_nodes * 12); - std::shared_ptr > scaled_rectangles_array = std::make_shared >(total_nodes * 12); - std::shared_ptr > weights_array = std::make_shared >(total_nodes * 3); - std::shared_ptr > alpha1_array = std::make_shared >(total_nodes); - std::shared_ptr > alpha2_array = std::make_shared >(total_nodes); - std::shared_ptr > tree_thresh_array = std::make_shared >(total_nodes); - std::shared_ptr > stages_thresh_array = std::make_shared >(stages); + std::shared_ptr> rectangles_array = std::make_shared>(total_nodes * 12); + std::shared_ptr> scaled_rectangles_array = std::make_shared>(total_nodes * 12); + std::shared_ptr> weights_array = std::make_shared>(total_nodes * 3); + std::shared_ptr> alpha1_array = std::make_shared>(total_nodes); + std::shared_ptr> alpha2_array = std::make_shared>(total_nodes); + std::shared_ptr> tree_thresh_array = std::make_shared>(total_nodes); + std::shared_ptr> stages_thresh_array = std::make_shared>(stages); FILE *fp = fopen(classifier_path.data(), "r"); /****************************************** @@ -2307,9 +2308,9 @@ std::shared_ptr SCAMP5M::read_viola_classifier(const std::string & * 18: alpha 2 of the filter ******************************************/ /* loop over n of stages */ - for (i = 0; i < stages; i++) { /* loop over n of trees */ - for (j = 0; j < stages_array[i]; j++) { /* loop over n of rectangular features */ - for (k = 0; k < 3; k++) { /* loop over the n of vertices */ + for (i = 0; i < stages; i++) { /* loop over n of trees */ + for (j = 0; j < stages_array[i]; j++) { /* loop over n of rectangular features */ + for (k = 0; k < 3; k++) { /* loop over the n of vertices */ for (l = 0; l < 4; l++) { if (fgets(mystring, 12, fp) != nullptr) rectangles_array->at(r_index) = atoi(mystring); @@ -2365,10 +2366,10 @@ std::shared_ptr SCAMP5M::read_viola_classifier(const std::string & } inline int int_round(float value) { - return (int) (value + (value >= 0 ? 
0.5 : -0.5)); + return (int)(value + (value >= 0 ? 0.5 : -0.5)); } -std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std::shared_ptr classifier, Size minSize, Size maxSize, float scaleFactor, int minNeighbors) { +std::vector SCAMP5M::vj_detect(const std::shared_ptr &src, std::shared_ptr classifier, Size minSize, Size maxSize, float scaleFactor, int minNeighbors) { /* group overlaping windows */ const float GROUP_EPS = 0.4f; @@ -2376,7 +2377,6 @@ std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std: // D for sum // E for sqsum - std::shared_ptr img1 = std::make_shared(src->width, src->height, C, 1); std::shared_ptr sum1 = std::make_shared(src->width, src->height, D, 1); std::shared_ptr sqsum = std::make_shared(src->width, src->height, E, 1); @@ -2414,7 +2414,6 @@ std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std: if (winSize.width < minSize.width || winSize.height < minSize.height) continue; - img1->width = sz.width; img1->height = sz.height; @@ -2429,9 +2428,9 @@ std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std: * downsampling using nearest neighbor **************************************/ vj_downsample(img1, src); -// cv::Mat img1v = this->readout(C); -// cv::imshow("Scaled", img1v); -// cv::waitKey(1); + // cv::Mat img1v = this->readout(C); + // cv::imshow("Scaled", img1v); + // cv::waitKey(1); /*************************************************** * Compute-intensive step: @@ -2439,21 +2438,21 @@ std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std: * compute a new integral and squared integral image ***************************************************/ vj_integral_image(img1, sum1, sqsum); -// for (int y = 0; y < sum1->height; ++y) { -// for (int x = 0; x < sum1->width; ++x) { -// std::cout << this->dram->read_signed_int(y, x, sum1->reg) << " "; -// } -// } -// std::cout << "----------------------------\n"; -// exit(EXIT_FAILURE); - -// cv::Mat sum1v = this->readout(D); - -// cv::imshow("Sum", sum1v); -// 
double minVal, maxVal; -// cv::minMaxLoc(sum1v, &minVal, &maxVal); -//// std::cout << minVal << " " << maxVal << std::endl; -// cv::waitKey(1); + // for (int y = 0; y < sum1->height; ++y) { + // for (int x = 0; x < sum1->width; ++x) { + // std::cout << this->dram->read_signed_int(y, x, sum1->reg) << " "; + // } + // } + // std::cout << "----------------------------\n"; + // exit(EXIT_FAILURE); + + // cv::Mat sum1v = this->readout(D); + + // cv::imshow("Sum", sum1v); + // double minVal, maxVal; + // cv::minMaxLoc(sum1v, &minVal, &maxVal); + //// std::cout << minVal << " " << maxVal << std::endl; + // cv::waitKey(1); /* sets images for haar classifier cascade */ /************************************************** @@ -2479,7 +2478,7 @@ std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std: std::shared_ptr> sqsum_val = vj_readout(E); vj_scale_invoke(classifier, sum_val, sqsum_val, factor, sum1->height, sum1->width, - allCandidates); + allCandidates); } /* end of the factor loop, finish all scales in pyramid*/ if (minNeighbors != 0) { @@ -2490,7 +2489,6 @@ std::vector SCAMP5M::vj_detect(const std::shared_ptr& src, std: std::cout << "Found face" << std::endl; } return allCandidates; - } void SCAMP5M::vj_set_image_for_cascade(std::shared_ptr classifier, std::shared_ptr sum, std::shared_ptr sqsum) { @@ -2557,12 +2555,9 @@ void SCAMP5M::vj_set_image_for_cascade(std::shared_ptr classifier, w_index += 3; } /* end of j loop */ } /* end i loop */ - } - -void SCAMP5M::vj_scale_invoke(std::shared_ptr classifier, std::shared_ptr> sum_val, std::shared_ptr> sqsum_val, float _factor, int sum_row, int sum_col, std::vector& allCandidates) { - +void SCAMP5M::vj_scale_invoke(std::shared_ptr classifier, std::shared_ptr> sum_val, std::shared_ptr> sqsum_val, float _factor, int sum_row, int sum_col, std::vector &allCandidates) { float factor = _factor; Point p; int result; @@ -2588,7 +2583,7 @@ void SCAMP5M::vj_scale_invoke(std::shared_ptr classifier, std::sha 
*********************************************/ for (x = 0; x < x2; x += step) for (y = 0; y < y2; y += step) { -// std::cout << "x: " << x << " y: " << y << "\n"; + // std::cout << "x: " << x << " y: " << y << "\n"; p.x = x; p.y = y; @@ -2615,7 +2610,7 @@ int SCAMP5M::run_vj_classifier(std::shared_ptr classifier, std::sh int r_index = 0; int stage_sum; -// std::cout << sum_val->size() << std::endl; + // std::cout << sum_val->size() << std::endl; p_offset = pt.y * (classifier->sum_img->width) + pt.x; pq_offset = pt.y * (classifier->sqsum_img->width) + pt.x; @@ -2645,7 +2640,6 @@ int SCAMP5M::run_vj_classifier(std::shared_ptr classifier, std::sh * send the shifted window through cascade filter. *************************************************/ for (i = start_stage; i < classifier->stages; i++) { - stage_sum = 0; for (j = 0; j < classifier->stages_array_->at(i); j++) { @@ -2675,32 +2669,18 @@ int SCAMP5M::run_vj_classifier(std::shared_ptr classifier, std::sh } inline int SCAMP5M::evalWeakClassifier(std::shared_ptr classifier, std::shared_ptr> sum_val, int variance_norm_factor, int p_offset, int tree_index, int w_index, int r_index) { - /* the node threshold is multiplied by the standard deviation of the image */ int t = classifier->tree_thresh_array_->at(tree_index) * variance_norm_factor; int i = classifier->scaled_rectangles_array_->at(r_index + 2); int index = i + p_offset; -// std::cout << i << ", " << p_offset << std::endl; - int sum = (sum_val->at(classifier->scaled_rectangles_array_->at(r_index) + p_offset) - - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 1) + p_offset) - - sum_val->at(index) - + sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 3) + p_offset)) - * classifier->weights_array_->at(w_index); - + // std::cout << i << ", " << p_offset << std::endl; + int sum = (sum_val->at(classifier->scaled_rectangles_array_->at(r_index) + p_offset) - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 1) + p_offset) - 
sum_val->at(index) + sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 3) + p_offset)) * classifier->weights_array_->at(w_index); - sum += (sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 4) + p_offset) - - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 5) + p_offset) - - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 6) + p_offset) - + sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 7) + p_offset)) - * classifier->weights_array_->at(w_index + 1); + sum += (sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 4) + p_offset) - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 5) + p_offset) - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 6) + p_offset) + sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 7) + p_offset)) * classifier->weights_array_->at(w_index + 1); if ((classifier->scaled_rectangles_array_->at(r_index + 8) != -1)) - sum += (sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 8) + p_offset) - - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 9)+ p_offset) - - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 10) + p_offset) - + sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 11) + p_offset)) - * classifier->weights_array_->at(w_index + 2); + sum += (sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 8) + p_offset) - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 9) + p_offset) - sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 10) + p_offset) + sum_val->at(classifier->scaled_rectangles_array_->at(r_index + 11) + p_offset)) * classifier->weights_array_->at(w_index + 2); if (sum >= t) return classifier->alpha2_array_->at(tree_index); @@ -2722,31 +2702,68 @@ std::shared_ptr> SCAMP5M::vj_readout(AREG src) { return out; } -void SCAMP5M::vj_downsample(std::shared_ptr dst, std::shared_ptr src) { - // nearest neighbour downsampling +void 
SCAMP5M::downsample(AREG dst, AREG src, float sf) { + std::shared_ptr dst_img = std::make_shared(this->rows_ * sf, this->cols_ * sf, dst, 1); + std::shared_ptr src_img = std::make_shared(this->rows_, this->cols_, src, 1); - int y; - int x; + vj_downsample(dst_img, src_img); +} + +void SCAMP5M::vj_horizontal_downsample(std::shared_ptr dst, std::shared_ptr src) { int w1 = src->width; int h1 = src->height; int w2 = dst->width; int h2 = dst->height; - int rat = 0; + std::cout << w1 << " " << h1 << " " << w2 << " " << h2 << std::endl; - int x_ratio = (int) ((w1 << 16) / w2) + 1; - int y_ratio = (int) ((h1 << 16) / h2) + 1; + for (int row = 0; row < h2; ++row) { + for (int col = 0; col < w2; ++col) { + int src_x = (int)((float)col / (float)w2 * (float)w1); + src_x = std::min(src_x, w1 - 1); - for (int i = 0; i < h2; i++) { - y = ((i * y_ratio) >> 16); - rat = 0; - for (int j = 0; j < w2; j++) { - x = (rat >> 16); - int val = this->dram->read_signed_int(y, x, src->reg); - this->dram->write_signed_int(i, j, dst->reg, val); - rat += x_ratio; + int val = this->dram->read_signed_byte(row, src_x, src->reg); + this->dram->write_signed_byte(row, col, dst->reg, val); } } + + // only go through + this->update_cycles(w2 * 18); + dram->update_dynamic(w2 * 16); + alu->update_dynamic(w2 * 2); +} + +void SCAMP5M::vj_downsample(std::shared_ptr dst, std::shared_ptr src) { + // nearest neighbour downsampling + + std::shared_ptr tmp = std::make_shared(src->height, dst->width, E, 1); + std::shared_ptr tmp2 = std::make_shared(src->height, dst->width, F, 1); + + vj_horizontal_downsample(tmp2, src); + vj_transpose(tmp, tmp2); + vj_horizontal_downsample(tmp2, tmp); + vj_transpose(dst, tmp2); +} + +void SCAMP5M::integral_image() { + // first summation + + this->update_cycles(cols_ * 66); + dram->update_dynamic(cols_ * 64); + alu->update_dynamic(cols_ * 2); + + // first transpose + this->update_cycles(rows_ * cols_ * 64); + dram->update_dynamic(rows_ * cols_ * 64); + + // second summation + 
this->update_cycles(cols_ * 66); + dram->update_dynamic(cols_ * 64); + alu->update_dynamic(cols_ * 2); + + // second transpose + this->update_cycles(rows_ * cols_ * 64); + dram->update_dynamic(rows_ * cols_ * 64); } void SCAMP5M::vj_integral_image(std::shared_ptr src, std::shared_ptr sum_image, std::shared_ptr sqrsum_image) { @@ -2754,9 +2771,8 @@ void SCAMP5M::vj_integral_image(std::shared_ptr src, std::shared_ptrheight; int width = src->width; - -// std::shared_ptr sum_temp = std::make_shared(width, height, NEWS, 1); -// std::shared_ptr sqsum_temp = std::make_shared(width, height, F, 1); + // std::shared_ptr sum_temp = std::make_shared(width, height, NEWS, 1); + // std::shared_ptr sqsum_temp = std::make_shared(width, height, F, 1); int x, y, s, sq, t, tq; unsigned char it; @@ -2781,134 +2797,134 @@ void SCAMP5M::vj_integral_image(std::shared_ptr src, std::shared_ptrdram->read_signed_byte(patch, index(r-row, c-col, cols_), src) + 128; -// std::cout << i << ","; -// } -// } -// patch++; -// } -// std::cout << "\n"; -// } + // for (int row = 0; row < rows_; row += row_stride_) { + // for (int col = 0; col < cols_; col += col_stride_) { + // for (int r = row; r < row + row_stride_; r++) { + // for (int c = col; c < col + col_stride_; c++) { + // int i = this->dram->read_signed_byte(patch, index(r-row, c-col, cols_), src) + 128; + // std::cout << i << ","; + // } + // } + // patch++; + // } + // std::cout << "\n"; + // } //sum up each row in parallel -// for (int row = 0; row < height; row ++) { -// int sum = 0; -// int sqsum = 0; -// for (int col = 0; col < width; col ++) { -// int i = this->dram->read_signed_int(row, col, src->reg); -// sum = this->alu->execute(sum, i, ALU::ADD); -// int sq = this->alu->execute(i, i, ALU::MUL); -// sqsum = this->alu->execute(sq, sqsum, ALU::ADD); -// this->dram->write_signed_int(row, col, sum_temp->reg, sum); -// this->dram->write_signed_int(row, col, sqsum_temp->reg, sq); -// } -// } - -// std::cout << "Row sums-------------" << 
std::endl; -// patch = 0; -// for (int row = 0; row < rows_; row += row_stride_) { -// for (int col = 0; col < cols_; col += col_stride_) { -// for (int r = row; r < row + row_stride_; r++) { -// for (int c = col; c < col + col_stride_; c++) { -// int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), F); -// std::cout << i << ","; -// } -// } -// patch++; -// } -// std::cout << "\n"; -// } - -// vj_transpose(sum_image, sum_temp); -// vj_transpose(sqrsum_image, sqsum_temp); -// -// std::cout << "Transpose -------------" << std::endl; -// patch = 0; -// for (int row = 0; row < rows_; row += row_stride_) { -// for (int col = 0; col < cols_; col += col_stride_) { -// for (int r = row; r < row + row_stride_; r++) { -// for (int c = col; c < col + col_stride_; c++) { -// int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), sqrsum_image); -// std::cout << i << ","; -// } -// } -// patch++; -// } -// std::cout << "\n"; -// } + // for (int row = 0; row < height; row ++) { + // int sum = 0; + // int sqsum = 0; + // for (int col = 0; col < width; col ++) { + // int i = this->dram->read_signed_int(row, col, src->reg); + // sum = this->alu->execute(sum, i, ALU::ADD); + // int sq = this->alu->execute(i, i, ALU::MUL); + // sqsum = this->alu->execute(sq, sqsum, ALU::ADD); + // this->dram->write_signed_int(row, col, sum_temp->reg, sum); + // this->dram->write_signed_int(row, col, sqsum_temp->reg, sq); + // } + // } + + // std::cout << "Row sums-------------" << std::endl; + // patch = 0; + // for (int row = 0; row < rows_; row += row_stride_) { + // for (int col = 0; col < cols_; col += col_stride_) { + // for (int r = row; r < row + row_stride_; r++) { + // for (int c = col; c < col + col_stride_; c++) { + // int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), F); + // std::cout << i << ","; + // } + // } + // patch++; + // } + // std::cout << "\n"; + // } + + // vj_transpose(sum_image, sum_temp); + // 
vj_transpose(sqrsum_image, sqsum_temp); + // + // std::cout << "Transpose -------------" << std::endl; + // patch = 0; + // for (int row = 0; row < rows_; row += row_stride_) { + // for (int col = 0; col < cols_; col += col_stride_) { + // for (int r = row; r < row + row_stride_; r++) { + // for (int c = col; c < col + col_stride_; c++) { + // int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), sqrsum_image); + // std::cout << i << ","; + // } + // } + // patch++; + // } + // std::cout << "\n"; + // } //sum up each row in parallel (now comtains cols) -// for (int row = 0; row < height; row ++) { -// int sum = 0; -// int sqsum = 0; -// for (int col = 0; col < width; col ++) { -// int i = this->dram->read_signed_int(row, col, sum_image->reg); -// sum = this->alu->execute(sum, i, ALU::ADD); -// this->dram->write_signed_int(row, col, sum_temp->reg, sum); -// -// int j = this->dram->read_signed_int(row, col, sqrsum_image->reg); -// sqsum = this->alu->execute(sqsum, j, ALU::ADD); -// this->dram->write_signed_int(row, col, sqsum_temp->reg, sqsum); -// } -// } - -// std::cout << "Sum cols -------------" << std::endl; -// patch = 0; -// for (int row = 0; row < rows_; row += row_stride_) { -// for (int col = 0; col < cols_; col += col_stride_) { -// for (int r = row; r < row + row_stride_; r++) { -// for (int c = col; c < col + col_stride_; c++) { -// int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), F); -// std::cout << i << ","; -// } -// } -// patch++; -// } -// std::cout << "\n"; -// } - - -// vj_transpose(sum_image, sum_temp); -// vj_transpose(sqrsum_image, sqsum_temp); - -// std::cout << "Final sums -------------" << std::endl; -// patch = 0; -// for (int row = 0; row < rows_; row += row_stride_) { -// for (int col = 0; col < cols_; col += col_stride_) { -// for (int r = row; r < row + row_stride_; r++) { -// for (int c = col; c < col + col_stride_; c++) { -// int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), 
sum_image->reg); -// std::cout << i << ","; -// } -// } -// patch++; -// } -// std::cout << "\n"; -// } - + // for (int row = 0; row < height; row ++) { + // int sum = 0; + // int sqsum = 0; + // for (int col = 0; col < width; col ++) { + // int i = this->dram->read_signed_int(row, col, sum_image->reg); + // sum = this->alu->execute(sum, i, ALU::ADD); + // this->dram->write_signed_int(row, col, sum_temp->reg, sum); + // + // int j = this->dram->read_signed_int(row, col, sqrsum_image->reg); + // sqsum = this->alu->execute(sqsum, j, ALU::ADD); + // this->dram->write_signed_int(row, col, sqsum_temp->reg, sqsum); + // } + // } + + // std::cout << "Sum cols -------------" << std::endl; + // patch = 0; + // for (int row = 0; row < rows_; row += row_stride_) { + // for (int col = 0; col < cols_; col += col_stride_) { + // for (int r = row; r < row + row_stride_; r++) { + // for (int c = col; c < col + col_stride_; c++) { + // int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), F); + // std::cout << i << ","; + // } + // } + // patch++; + // } + // std::cout << "\n"; + // } + + // vj_transpose(sum_image, sum_temp); + // vj_transpose(sqrsum_image, sqsum_temp); + + // std::cout << "Final sums -------------" << std::endl; + // patch = 0; + // for (int row = 0; row < rows_; row += row_stride_) { + // for (int col = 0; col < cols_; col += col_stride_) { + // for (int r = row; r < row + row_stride_; r++) { + // for (int c = col; c < col + col_stride_; c++) { + // int i = this->dram->read_signed_int(patch, index(r-row, c-col, cols_), sum_image->reg); + // std::cout << i << ","; + // } + // } + // patch++; + // } + // std::cout << "\n"; + // } } void SCAMP5M::vj_transpose(std::shared_ptr dst, std::shared_ptr src) { - for (int row = 0; row < src->height; row ++) { - for (int col = 0; col < src->width; col ++) { - int i = this->dram->read_signed_int(col, row, src->reg); - this->dram->write_signed_int(row, col, dst->reg, i); + for (int row = 0; row < src->height; 
row++) { + for (int col = 0; col < src->width; col++) { + int i = this->dram->read_signed_byte(col, row, src->reg); + this->dram->write_signed_byte(row, col, dst->reg, i); } } + this->update_cycles(num_pixels * 16); + dram->update_dynamic(num_pixels * 16); } //todo write proper tests until this vj stuff works cv::Mat SCAMP5M::readout(AREG areg) { cv::Mat val = cv::Mat::zeros(cv::Size(rows_, cols_), CV_8U); - for (int row = 0; row < rows_; row ++) { + for (int row = 0; row < rows_; row++) { for (int col = 0; col < cols_; col++) { int i = this->dram->read_signed_int(row, col, areg); - val.at(row, col) = i ; + val.at(row, col) = i; } } return val; @@ -2916,15 +2932,15 @@ cv::Mat SCAMP5M::readout(AREG areg) { void SCAMP5M::viola_jones(AREG areg) { cv::Mat val = cv::Mat::zeros(cv::Size(rows_, cols_), CV_8U); - for (int row = 0; row < rows_; row ++) { + for (int row = 0; row < rows_; row++) { for (int col = 0; col < cols_; col++) { int i = this->dram->read_signed_byte(row, col, areg); this->dram->write_signed_int(row, col, areg, i + 128); - val.at(row, col) = i+128; + val.at(row, col) = i + 128; } } -// cv::Mat val = this->readout(A); + // cv::Mat val = this->readout(A); Size minSize = {20, 20}; Size maxSize = {0, 0}; @@ -2942,12 +2958,12 @@ void SCAMP5M::viola_jones(AREG areg) { for (auto &face: faces) { cv::Point center(face.x + face.width / 2, face.y + face.height / 2); cv::rectangle(val, face, cv::Scalar(255, 0, 255)); -// ellipse(val, center, cv::Size(face.width / 2, face.height / 2), 0, 0, 360, cv::Scalar(255, 0, 255), 4); + // ellipse(val, center, cv::Size(face.width / 2, face.height / 2), 0, 0, 360, cv::Scalar(255, 0, 255), 4); } double minVal, maxVal; -// cv::minMaxLoc(val, &minVal, &maxVal); -// std::cout << minVal << " " << maxVal << std::endl; + // cv::minMaxLoc(val, &minVal, &maxVal); + // std::cout << minVal << " " << maxVal << std::endl; cv::imshow("later", val); cv::waitKey(1); } @@ -2988,7 +3004,6 @@ void SCAMP5M::jpeg_compression(AREG dst, AREG src) { 
{49, 64, 78, 87, 103, 121, 120, 101}, {72, 92, 95, 98, 112, 100, 103, 99}}; - int sf = 32 * 32; // left multiply with DCT @@ -3006,7 +3021,6 @@ void SCAMP5M::jpeg_compression(AREG dst, AREG src) { sum = this->alu->execute(sum, mult, ALU::ADD); } this->dram->write_signed_int(patch, index(r - row, c - col, 8), dst, sum); - } } patch++; @@ -3050,7 +3064,6 @@ void SCAMP5M::jpeg_compression(AREG dst, AREG src) { patch++; } } - } //move to base class @@ -3338,5 +3351,7 @@ RTTR_REGISTRATION { .method("vj_transpose", &SCAMP5M::vj_transpose) .method("vj_integral_image", &SCAMP5M::vj_integral_image) .method("vj_readout", &SCAMP5M::vj_readout) - .method("jpeg_compression", &SCAMP5M::jpeg_compression); + .method("downsample", &SCAMP5M::downsample) + .method("jpeg_compression", &SCAMP5M::jpeg_compression) + .method("integral_image", &SCAMP5M::integral_image); } diff --git a/scamp5_multiplexed/scamp5m.h b/scamp5_multiplexed/scamp5m.h index e7e2b74..fa40441 100644 --- a/scamp5_multiplexed/scamp5m.h +++ b/scamp5_multiplexed/scamp5m.h @@ -19,14 +19,14 @@ enum AREG { PIX = 0, - IN = 32, - NEWS = 64, - A = 96, - B = 128, - C = 160, - D = 192, - E = 224, - F = 256 + IN = 8, + NEWS = 16, + A = 24, + B = 32, + C = 40, + D = 48, + E = 56, + F = 64 }; enum DREG { @@ -336,8 +336,11 @@ class SCAMP5M : public Architecture { void init_viola(); std::shared_ptr read_viola_classifier(const std::string& classifier_path); std::vector vj_detect(const std::shared_ptr& src, std::shared_ptr classifier, Size minSize, Size maxSize, float scaleFactor, int minNeighbors); + void downsample(AREG dst, AREG src, float sf); + void vj_horizontal_downsample(std::shared_ptr dst, std::shared_ptr src); void vj_downsample(std::shared_ptr dst, std::shared_ptr src); void vj_scale_invoke(std::shared_ptr classifier, std::shared_ptr> sum_val, std::shared_ptr> sqsum_val, float _factor, int sum_row, int sum_col, std::vector &_vec); + void integral_image(); void vj_integral_image(std::shared_ptr src, std::shared_ptr 
sum_image, std::shared_ptr sqrsum_image); void vj_transpose(std::shared_ptr dst, std::shared_ptr src); void vj_set_image_for_cascade(std::shared_ptr classifier, std::shared_ptr sum, std::shared_ptr sqsum); diff --git a/scamp5_multiplexed/scamp5rmalt.cpp b/scamp5_multiplexed/scamp5rmalt.cpp index 19d3768..0799bdd 100644 --- a/scamp5_multiplexed/scamp5rmalt.cpp +++ b/scamp5_multiplexed/scamp5rmalt.cpp @@ -81,7 +81,12 @@ void SCAMP5RMALT::motion() { void SCAMP5RMALT::get_image(AREG y, AREG h) { // y := full-range image, h := negative half-range image, and reset *PIX - cv::Mat image = this->pe->get_pixel()->read(); + cv::Mat image; +#ifdef USE_CUDA + this->pe->get_pixel()->read().download(image); +#else + image = this->pe->get_pixel()->read().getMat(cv::ACCESS_READ); +#endif for (int row = 0; row < this->rows_; ++row) { for (int col = 0; col < this->cols_; ++col) { diff --git a/scamp5_multiplexed/ssim.py b/scamp5_multiplexed/ssim.py deleted file mode 100644 index d7d0fac..0000000 --- a/scamp5_multiplexed/ssim.py +++ /dev/null @@ -1,32 +0,0 @@ -# Usage: -# -# python3 script.py --input original.png --output modified.png -# Based on: https://github.com/mostafaGwely/Structural-Similarity-Index-SSIM- - -# 1. Import the necessary packages -from skimage.metrics import structural_similarity as ssim -import argparse -import imutils -import cv2 - -# 2. Construct the argument parse and parse the arguments -ap = argparse.ArgumentParser() -ap.add_argument("-f", "--first", required=True, help="Directory of the image that will be compared") -ap.add_argument("-s", "--second", required=True, help="Directory of the image that will be used to compare") -args = vars(ap.parse_args()) - -# 3. Load the two input images -imageA = cv2.imread(args["first"]) -imageB = cv2.imread(args["second"]) - -# 4. Convert the images to grayscale -grayA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY) -grayB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY) - -# 5. 
Compute the Structural Similarity Index (SSIM) between the two -# images, ensuring that the difference image is returned -(score, diff) = ssim(grayA, grayB, full=True) -diff = (diff * 255).astype("uint8") - -# 6. You can print only the score if you want -print("SSIM: {}".format(score)) diff --git a/src/simulator/adc/adc.cpp b/src/simulator/adc/adc.cpp index 0a41eee..c03d595 100644 --- a/src/simulator/adc/adc.cpp +++ b/src/simulator/adc/adc.cpp @@ -6,20 +6,10 @@ #include #include "simulator/util/utility.h" - void ADC::init() { #ifdef TRACK_STATISTICS - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); time_ = (this->cycle_count_ * (1.0 / config_->get_clock_rate())); - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - this->calc_internal_mask(); + Component::init(); #endif } diff --git a/src/simulator/adders/cla.cpp b/src/simulator/adders/cla.cpp index b2168ee..d51b70a 100644 --- a/src/simulator/adders/cla.cpp +++ b/src/simulator/adders/cla.cpp @@ -12,21 +12,12 @@ /*Bits refers to the number of bits in the two inputs and the output. 
So an 8-bit adders takes in two 8-bit values and outputs an 8-bit value*/ void CarryLookAheadAdder::init() { - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); #ifdef TRACK_STATISTICS cycle_count_ = 1; - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); time_ = this->cycle_count_ * (1.0 / config_->get_clock_rate()); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - scratch = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); + scratch = cv::UMat(rows_, cols_, CV_8U, cv::Scalar(0)); + Component::init(); #endif - this->calc_internal_mask(); } #ifdef TRACK_STATISTICS diff --git a/src/simulator/alu/alu.cpp b/src/simulator/alu/alu.cpp index a31d2b5..49b4e65 100644 --- a/src/simulator/alu/alu.cpp +++ b/src/simulator/alu/alu.cpp @@ -7,17 +7,10 @@ #include void ALU::init() { - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); #ifdef TRACK_STATISTICS - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_power_ = calc_dynamic(); time_ = (this->cycle_count_ * (1.0 / config_->get_clock_rate())); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); + Component::init(); #endif - this->calc_internal_mask(); } void ALU::set_bits(int bits) { diff --git a/src/simulator/base/architecture.cpp b/src/simulator/base/architecture.cpp index 31cef9a..483ea67 100644 --- a/src/simulator/base/architecture.cpp +++ b/src/simulator/base/architecture.cpp @@ -105,6 +105,7 @@ void Architecture::print_stats(int rows, int cols) { std::cout << "Architecture static power: " << static_power << " W\n"; std::cout 
<< "Architecture dynamic energy: " << dynamic_energy << " J\n"; std::cout << "Architecture dynamic power: " << dynamic_power << " W\n"; + std::cout << "Architecture total energy: " << static_energy + dynamic_energy << " J\n"; std::cout << "Architecture total power: " << static_power + dynamic_power << " W\n"; double width = 0; for (auto& [_, component] : components_) { diff --git a/src/simulator/base/component.cpp b/src/simulator/base/component.cpp index 41b27bf..e9b20fe 100644 --- a/src/simulator/base/component.cpp +++ b/src/simulator/base/component.cpp @@ -5,14 +5,42 @@ #include #include +void Component::init() { +#ifdef TRACK_STATISTICS + transistor_count_ = calc_transistor_count(); + static_power_ = calc_static(); + dynamic_power_ = calc_dynamic(); + width_ = calc_width(); + height_ = calc_height(); +#ifdef USE_CUDA + internal_mask = cv::cuda::GpuMat(rows_, cols_, CV_8U, cv::Scalar(0)); + array_transistor_count_ = cv::cuda::GpuMat(rows_, cols_, CV_32S, cv::Scalar(0)); + array_static_energy_ = cv::UMat(rows_, cols_, CV_64F, cv::Scalar(0)); + array_dynamic_energy_ = cv::UMat(rows_, cols_, CV_64F, cv::Scalar(0)); +#else + internal_mask = cv::UMat(rows_, cols_, CV_8U, cv::Scalar(0)); + array_transistor_count_ = cv::UMat(rows_, cols_, CV_32S, cv::Scalar(0)); + array_static_energy_ = cv::UMat(rows_, cols_, CV_64F, cv::Scalar(0)); + array_dynamic_energy_ = cv::UMat(rows_, cols_, CV_64F, cv::Scalar(0)); +#endif + this->calc_internal_mask(); +#endif +} + void Component::calc_internal_mask() { + cv::Mat im; +#ifdef USE_CUDA + this->internal_mask.download(im); +#else + im = this->internal_mask.getMat(cv::ACCESS_WRITE); +#endif for (int row = 0; row < rows_; row += row_stride_) { for (int col = 0; col < cols_; col += col_stride_) { - this->internal_mask.at(row, col) = 1; + im.at(row, col) = 1; } } -#ifdef TRACK_STATISTICS - array_transistor_count_.setTo(transistor_count_, this->internal_mask); +#ifdef USE_CUDA + this->internal_mask.upload(im); #endif } @@ -43,15 +71,30 @@ 
void Component::set_config(std::shared_ptr config) { #ifdef TRACK_STATISTICS cv::Mat Component::get_static_energy_array() { - return this->array_static_energy_; +#ifdef TRACK_STATISTICS + array_static_energy_.copyTo(array_static_energy_, this->internal_mask); +#endif + return this->array_static_energy_.getMat(cv::ACCESS_READ); } cv::Mat Component::get_dynamic_energy_array() { - return this->array_dynamic_energy_; +#ifdef TRACK_STATISTICS + array_dynamic_energy_.copyTo(array_dynamic_energy_, this->internal_mask); +#endif + return this->array_dynamic_energy_.getMat(cv::ACCESS_READ); } cv::Mat Component::get_transistor_count_array() { - return this->array_transistor_count_; +#ifdef TRACK_STATISTICS + array_transistor_count_.setTo(transistor_count_, this->internal_mask); +#endif +#ifdef USE_CUDA + cv::Mat m; + this->array_transistor_count_.download(m); + return m; +#else + return this->array_transistor_count_.getMat(cv::ACCESS_READ); +#endif } int Component::get_transistor_count() { @@ -66,6 +109,22 @@ double Component::get_height() { return this->height_; } +double Component::scale_speed(double base) { + double sf = 7.6 - 0.961 * log(5.531 * config_->get_clock_rate() - 104.456); + return base * sf; +} + +double Component::scale_width(double base) { + // Scale with process node + double sf = (double) config_->get_process_node() / this->process_node_; + return base * sf; +} + +double Component::scale_height(double base) { + double sf = (double) config_->get_process_node() / this->process_node_; + return base * sf; +} + int Component::calc_transistor_count() { return 0; } @@ -86,22 +145,6 @@ double Component::calc_height() { return 0; } -double Component::scale_speed(double base) { - double sf = 7.6 - 0.961 * log(5.531 * config_->get_clock_rate() - 104.456); - return base * sf; -} - -double Component::scale_width(double base) { - // Scale with process node - double sf = (double) config_->get_process_node() / this->process_node_; - return base * sf; -} - -double 
Component::scale_height(double base) { - double sf = (double) config_->get_process_node() / this->process_node_; - return base * sf; -} - #endif RTTR_REGISTRATION { diff --git a/src/simulator/base/opencv_wrappers.cpp b/src/simulator/base/opencv_wrappers.cpp new file mode 100644 index 0000000..ae522d8 --- /dev/null +++ b/src/simulator/base/opencv_wrappers.cpp @@ -0,0 +1,22 @@ +// +// Created by jm1417 on 29/05/2021. +// + +#include "simulator/base/opencv_wrappers.h" +#include + +void ocv_wrappers::arith::add(cv::OutputArray dst, cv::InputArray src1, cv::InputArray src2) { +#ifdef USE_CUDA + cv::cuda::add(src1, src2, dst); +#else + cv::add(src1, src2, dst); +#endif +} + +void ocv_wrappers::arith::add(cv::OutputArray dst, cv::InputArray src1, cv::InputArray src2, cv::InputArray mask) { +#ifdef USE_CUDA + cv::cuda::add(src1, src2, dst, mask); +#else + cv::add(src1, src2, dst, mask); +#endif +} diff --git a/src/simulator/base/pixel.cpp b/src/simulator/base/pixel.cpp index 12a63af..54b6769 100644 --- a/src/simulator/base/pixel.cpp +++ b/src/simulator/base/pixel.cpp @@ -12,6 +12,7 @@ #include #include +#include "simulator/base/opencv_wrappers.h" Pixel::Pixel(int rows, int cols, int row_stride, int col_stride, Source src, const std::string& path, std::shared_ptr config) { @@ -27,17 +28,7 @@ Pixel::Pixel(int rows, int cols, int row_stride, int col_stride, Source src, con void Pixel::init() { #ifdef TRACK_STATISTICS process_node_ = 180; - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - - this->calc_internal_mask(); + Component::init(); #endif } @@ -68,14 +59,17 @@ void 
Pixel::set_src(Source src) { void Pixel::reset() { input_source->reset(); } -cv::Mat Pixel::read() { - cv::Mat m = input_source->read(); +#ifdef USE_CUDA +cv::cuda::GpuMat& Pixel::read() { +#else +cv::UMat& Pixel::read() { +#endif #ifdef TRACK_STATISTICS double seconds = this->input_source->last_frame_time(); cycle_count_ = seconds * this->config_->get_clock_rate(); - cv::add(this->array_dynamic_energy_, this->dynamic_power_, this->array_dynamic_energy_, this->internal_mask); + ocv_wrappers::arith::add(this->array_dynamic_energy_, this->dynamic_power_, this->array_dynamic_energy_, this->internal_mask); #endif - return m; + return input_source->read(); } void Pixel::read(Register& reg) { @@ -83,7 +77,7 @@ void Pixel::read(Register& reg) { #ifdef TRACK_STATISTICS double seconds = this->input_source->last_frame_time(); cycle_count_ = seconds * this->config_->get_clock_rate(); - cv::add(this->array_dynamic_energy_, this->dynamic_power_, this->array_dynamic_energy_, this->internal_mask); + ocv_wrappers::arith::add(this->array_dynamic_energy_, this->dynamic_power_, this->array_dynamic_energy_, this->internal_mask); #endif } @@ -98,7 +92,7 @@ int Pixel::get_cycle_count() { } void Pixel::update_static(double time) { - cv::add(this->array_static_energy_, this->static_power_ * time, this->array_static_energy_, this->internal_mask); + ocv_wrappers::arith::add(this->array_static_energy_, this->static_power_ * time, this->array_static_energy_, this->internal_mask); } void Pixel::print_stats(const CycleCounter& counter) { diff --git a/src/simulator/buses/analogue_bus.cpp b/src/simulator/buses/analogue_bus.cpp index a62b612..efd57a3 100644 --- a/src/simulator/buses/analogue_bus.cpp +++ b/src/simulator/buses/analogue_bus.cpp @@ -6,6 +6,7 @@ #include #include +#include #include void AnalogueBus::bus(AnalogueRegister &a, DigitalRegister &FLAG) { @@ -16,55 +17,55 @@ void AnalogueBus::bus(AnalogueRegister &a, DigitalRegister &FLAG) { void AnalogueBus::bus(AnalogueRegister &a, 
AnalogueRegister &a0, DigitalRegister &FLAG) { // a = -a0 + error - cv::Mat &src = a0.read(); - cv::Mat &dst = a.read(); - cv::Mat &mask = FLAG.read(); - cv::bitwise_not(src, dst, mask); - cv::add(dst, 1, dst, mask); //todo counts +#ifdef USE_CUDA + cv::cuda::subtract(0, a0.read(), a.read(), FLAG.read()); +#else + cv::subtract(0, a0.read(), a.read(), FLAG.read()); +#endif } void AnalogueBus::bus(AnalogueRegister &a, AnalogueRegister &a0, AnalogueRegister &a1, DigitalRegister &FLAG) { // a = -(a0 + a1) + error - cv::Mat &src_1 = a0.read(); - cv::Mat &src_2 = a1.read(); - cv::Mat &dst = a.read(); - cv::Mat &mask = FLAG.read(); - cv::add(src_1, src_2, scratch, mask); - cv::bitwise_not(scratch, dst, mask); - cv::add(dst, 1, dst, mask); +#ifdef USE_CUDA + cv::cuda::add(a0.read(), a1.read(), scratch, FLAG.read()); + cv::cuda::subtract(0, scratch, a.read(), FLAG.read()); +#else + cv::add(a0.read(), a1.read(), scratch, FLAG.read()); + cv::subtract(0, scratch, a.read(), FLAG.read()); +#endif } void AnalogueBus::bus(AnalogueRegister &a, AnalogueRegister &a0, AnalogueRegister &a1, AnalogueRegister &a2, DigitalRegister &FLAG) { // a = -(a0 + a1 + a2) + error - cv::Mat &src_1 = a0.read(); - cv::Mat &src_2 = a1.read(); - cv::Mat &src_3 = a2.read(); - cv::Mat &dst = a.read(); - cv::Mat &mask = FLAG.read(); - cv::add(src_1, src_2, scratch, mask); - cv::add(scratch, src_3, scratch, mask); - cv::bitwise_not(scratch, dst, mask); - cv::add(dst, 1, dst, mask); +#ifdef USE_CUDA + cv::cuda::add(a0.read(), a1.read(), scratch, FLAG.read()); + cv::cuda::add(scratch, a2.read(), scratch, FLAG.read()); + cv::cuda::subtract(0, scratch, a.read(), FLAG.read()); +#else + cv::add(a0.read(), a1.read(), scratch, FLAG.read()); + cv::add(scratch, a2.read(), scratch, FLAG.read()); + cv::subtract(0, scratch, a.read(), FLAG.read()); +#endif } void AnalogueBus::bus(AnalogueRegister &a, AnalogueRegister &a0, AnalogueRegister &a1, AnalogueRegister &a2, AnalogueRegister &a3, DigitalRegister &FLAG) { // a = 
-(a0 + a1 + a2 + a3) + error - cv::Mat &src_1 = a0.read(); - cv::Mat &src_2 = a1.read(); - cv::Mat &src_3 = a2.read(); - cv::Mat &src_4 = a3.read(); - cv::Mat &dst = a.read(); - cv::Mat &mask = FLAG.read(); - cv::add(src_1, src_2, scratch, mask); - cv::add(scratch, src_3, scratch, mask); - cv::add(scratch, src_4, scratch, mask); - cv::bitwise_not(scratch, dst, mask); - cv::add(dst, 1, dst, mask); +#ifdef USE_CUDA + cv::cuda::add( a0.read(), a1.read(), scratch, FLAG.read()); + cv::cuda::add(scratch, a2.read(), scratch, FLAG.read()); + cv::cuda::add(scratch, a3.read(), scratch, FLAG.read()); + cv::cuda::subtract(0, scratch, a.read(), FLAG.read()); +#else + cv::add( a0.read(), a1.read(), scratch, FLAG.read()); + cv::add(scratch, a2.read(), scratch, FLAG.read()); + cv::add(scratch, a3.read(), scratch, FLAG.read()); + cv::subtract(0, scratch, a.read(), FLAG.read()); +#endif } void AnalogueBus::bus2(AnalogueRegister &a, AnalogueRegister &b, @@ -77,9 +78,13 @@ void AnalogueBus::bus2(AnalogueRegister &a, AnalogueRegister &b, void AnalogueBus::bus2(AnalogueRegister &a, AnalogueRegister &b, AnalogueRegister &a0, DigitalRegister &FLAG) { // a,b = -0.5*a0 + error + noise +#ifdef USE_CUDA + cv::cuda::multiply(a0.read(), 0.5, scratch); + cv::cuda::subtract(0, scratch, scratch, FLAG.read()); +#else cv::multiply(a0.read(), 0.5, scratch); - cv::bitwise_not(scratch, scratch, FLAG.read()); - cv::add(scratch, 1, scratch, FLAG.read()); + cv::subtract(0, scratch, scratch, FLAG.read()); +#endif a.write(scratch, FLAG.read()); b.write(scratch, FLAG.read()); } @@ -88,10 +93,15 @@ void AnalogueBus::bus2(AnalogueRegister &a, AnalogueRegister &b, AnalogueRegister &a0, AnalogueRegister &a1, DigitalRegister &FLAG) { // a,b = -0.5*(a0 + a1) + error + noise +#ifdef USE_CUDA + cv::cuda::add(a0.read(), a1.read(), scratch, FLAG.read()); + cv::cuda::multiply(scratch, 0.5, scratch); + cv::cuda::subtract(0, scratch, scratch, FLAG.read()); +#else cv::add(a0.read(), a1.read(), scratch, FLAG.read()); 
cv::multiply(scratch, 0.5, scratch); - cv::bitwise_not(scratch, scratch, FLAG.read()); - cv::add(scratch, 1, scratch, FLAG.read()); + cv::subtract(0, scratch, scratch, FLAG.read()); +#endif a.write(scratch, FLAG.read()); b.write(scratch, FLAG.read()); } @@ -100,9 +110,13 @@ void AnalogueBus::bus3(AnalogueRegister &a, AnalogueRegister &b, AnalogueRegister &c, AnalogueRegister &a0, DigitalRegister &FLAG) { // a,b,c = -0.33*a0 + error + noise +#ifdef USE_CUDA + cv::cuda::multiply(0.333, a0.read(), scratch); + cv::cuda::subtract(0, scratch, scratch, FLAG.read()); +#else cv::multiply(0.333, a0.read(), scratch); - cv::bitwise_not(scratch, scratch, FLAG.read()); - cv::add(scratch, 1, scratch, FLAG.read()); + cv::subtract(0, scratch, scratch, FLAG.read()); +#endif a.write(scratch, FLAG.read()); b.write(scratch, FLAG.read()); c.write(scratch, FLAG.read()); @@ -111,7 +125,11 @@ void AnalogueBus::bus3(AnalogueRegister &a, AnalogueRegister &b, void AnalogueBus::conditional_positive_set(DigitalRegister &b, AnalogueRegister &a) { // b := 1 if a > 0 +#ifdef USE_CUDA + cv::cuda::threshold(a.read(), b.read(), 0, 1, cv::THRESH_BINARY); +#else cv::threshold(a.read(), b.read(), 0, 1, cv::THRESH_BINARY); +#endif b.read().convertTo(b.read(), CV_8U); } @@ -119,8 +137,13 @@ void AnalogueBus::conditional_positive_set(DigitalRegister &b, AnalogueRegister &a0, AnalogueRegister &a1) { // b := 1 if (a0 + a1) > 0. +#ifdef USE_CUDA + cv::cuda::add(a0.read(), a1.read(), scratch); + cv::cuda::threshold(scratch, b.read(), 0, 1, cv::THRESH_BINARY); +#else cv::add(a0.read(), a1.read(), scratch); cv::threshold(scratch, b.read(), 0, 1, cv::THRESH_BINARY); +#endif b.read().convertTo(b.read(), CV_8U); } @@ -129,9 +152,15 @@ void AnalogueBus::conditional_positive_set(DigitalRegister &b, AnalogueRegister &a1, AnalogueRegister &a2) { // b := 1 if (a0 + a1 + a2) > 0. 
+#ifdef USE_CUDA + cv::cuda::add(a0.read(), a1.read(), scratch); + cv::cuda::add(scratch, a2.read(), scratch); + cv::cuda::threshold(scratch, b.read(), 0, 1, cv::THRESH_BINARY); +#else cv::add(a0.read(), a1.read(), scratch); cv::add(scratch, a2.read(), scratch); - threshold(scratch, b.read(), 0, 1, cv::THRESH_BINARY); + cv::threshold(scratch, b.read(), 0, 1, cv::THRESH_BINARY); +#endif b.read().convertTo(b.read(), CV_8U); } @@ -363,9 +392,16 @@ void AnalogueBus::scan(uint8_t *dst, AnalogueRegister &src, uint8_t row_start, row_step, col_step); int buf_index = 0; + cv::Mat m; +#ifdef USE_CUDA + src.read().download(m); +#else + m = src.read().getMat(cv::ACCESS_READ); +#endif + for(int col = p.col_start; p.col_op(col, p.col_end); col += p.col_step) { for(int row = p.row_start; p.row_op(row, p.row_end); row += p.row_step) { - dst[buf_index++] = src.read().at(row, col); + dst[buf_index++] = m.at(row, col); } } } diff --git a/src/simulator/buses/digital_bus.cpp b/src/simulator/buses/digital_bus.cpp index 917896e..3c9ef7d 100644 --- a/src/simulator/buses/digital_bus.cpp +++ b/src/simulator/buses/digital_bus.cpp @@ -3,6 +3,7 @@ // #include "simulator/buses/digital_bus.h" +#include #include @@ -11,7 +12,11 @@ void DigitalBus::OR(DigitalRegister &d, DigitalRegister &d0, // d := d0 OR d1 // TODO this still isn't enough because the write to d is missed // TODO Wrapper around all the opencv functions? 
+#ifdef USE_CUDA + cv::cuda::bitwise_or(d0.read(), d1.read(), d.read(), d.get_mask()); +#else cv::bitwise_or(d0.read(), d1.read(), d.read(), d.get_mask()); +#endif #ifdef TRACK_STATISTICS d.inc_write(d.get_mask()); d0.inc_read(); @@ -22,8 +27,13 @@ void DigitalBus::OR(DigitalRegister &d, DigitalRegister &d0, void DigitalBus::OR(DigitalRegister &d, DigitalRegister &d0, DigitalRegister &d1, DigitalRegister &d2) { // d := d0 OR d1 OR d2 +#ifdef USE_CUDA + cv::cuda::bitwise_or(d0.read(), d1.read(), d0.read()); + cv::cuda::bitwise_or(d0.read(), d2.read(), d.read(), d.get_mask()); +#else cv::bitwise_or(d0.read(), d1.read(), d0.read()); cv::bitwise_or(d0.read(), d2.read(), d.read(), d.get_mask()); +#endif #ifdef TRACK_STATISTICS d.inc_write(d.get_mask()); d0.inc_read(); @@ -36,9 +46,15 @@ void DigitalBus::OR(DigitalRegister &d, DigitalRegister &d0, DigitalRegister &d1, DigitalRegister &d2, DigitalRegister &d3) { // d := d0 OR d1 OR d2 OR d3 +#ifdef USE_CUDA + cv::cuda::bitwise_or(d0.read(), d1.read(), d0.read()); + cv::cuda::bitwise_or(d0.read(), d2.read(), d0.read()); + cv::cuda::bitwise_or(d0.read(), d3.read(), d.read(), d.get_mask()); +#else cv::bitwise_or(d0.read(), d1.read(), d0.read()); cv::bitwise_or(d0.read(), d2.read(), d0.read()); cv::bitwise_or(d0.read(), d3.read(), d.read(), d.get_mask()); +#endif #ifdef TRACK_STATISTICS d.inc_write(d.get_mask()); d0.inc_read(); @@ -50,7 +66,12 @@ void DigitalBus::OR(DigitalRegister &d, DigitalRegister &d0, void DigitalBus::NOT(DigitalRegister &d, DigitalRegister &d0) { // d := NOT d0 +#ifdef USE_CUDA + cv::cuda::bitwise_xor(d0.read(), 1, d.read(), d.get_mask()); +#else cv::bitwise_xor(d0.read(), 1, d.read(), d.get_mask()); +#endif + #ifdef TRACK_STATISTICS d.inc_write(d.get_mask()); d0.inc_write(); @@ -99,7 +120,11 @@ void DigitalBus::NOR(DigitalRegister &d, DigitalRegister &d0, void DigitalBus::NOT(DigitalRegister &Rl) { // Rl := NOT Rl +#ifdef USE_CUDA + cv::cuda::bitwise_xor(Rl.read(), 1, Rl.read(), Rl.get_mask()); +#else 
cv::bitwise_xor(Rl.read(), 1, Rl.read(), Rl.get_mask()); +#endif #ifdef TRACK_STATISTICS Rl.inc_write(Rl.get_mask()); Rl.inc_read(); @@ -129,7 +154,11 @@ void DigitalBus::NOR(DigitalRegister &Rl, DigitalRegister &Rx) { void DigitalBus::AND(DigitalRegister &Ra, DigitalRegister &Rx, DigitalRegister &Ry) { // Ra := Rx AND Ry +#ifdef USE_CUDA + cv::cuda::bitwise_and(Rx.read(), Ry.read(), Ra.read(), Ra.get_mask()); +#else cv::bitwise_and(Rx.read(), Ry.read(), Ra.read(), Ra.get_mask()); +#endif #ifdef TRACK_STATISTICS Ra.inc_write(Ra.get_mask()); Rx.inc_read(); @@ -184,7 +213,11 @@ void DigitalBus::NIMP(DigitalRegister &Rl, DigitalRegister &Rx, void DigitalBus::XOR(DigitalRegister &Rl, DigitalRegister &Rx, DigitalRegister &Ry) { // Rl := Rx XOR Ry +#ifdef USE_CUDA + cv::cuda::bitwise_xor(Rx.read(), Ry.read(), Rl.read(), Rl.get_mask()); +#else cv::bitwise_xor(Rx.read(), Ry.read(), Rl.read(), Rl.get_mask()); +#endif #ifdef TRACK_STATISTICS Rl.inc_write(); Rx.inc_read(); @@ -194,7 +227,15 @@ void DigitalBus::XOR(DigitalRegister &Rl, DigitalRegister &Rx, void DigitalBus::MOV(DigitalRegister &d, DigitalRegister &d0) { // d := d0 - cv::copyTo(d0.read(), d.read(), d.get_mask()); +#ifdef USE_CUDA + if (!d.get_mask().empty()) { + d0.read().copyTo(d.read(), d.get_mask()); + } else { + d0.read().copyTo(d.read()); + } +#else + d0.read().copyTo(d.read(), d.get_mask()); +#endif #ifdef TRACK_STATISTICS d.inc_write(d.get_mask()); d0.inc_read(); @@ -207,7 +248,11 @@ void DigitalBus::MUX(DigitalRegister &Rl, DigitalRegister &Rx, // R1 = (Ry.~Rx) + (Rz.Rx) DigitalRegister intermediate(Rl.read().rows, Rl.read().cols); DigitalRegister intermediate2(Rl.read().rows, Rl.read().cols); +#ifdef USE_CUDA + cv::cuda::bitwise_not(Rx.read(), intermediate.read()); +#else cv::bitwise_not(Rx.read(), intermediate.read()); +#endif DigitalBus::AND(intermediate2, Ry, intermediate); DigitalBus::AND(intermediate, Rz, Rx); DigitalBus::OR(Rl, intermediate, intermediate2); @@ -237,24 +282,39 @@ void 
DigitalBus::CLR_IF(DigitalRegister &Rl, DigitalRegister &Rx) { void DigitalBus::OR_MASKED(DigitalRegister &d, DigitalRegister &d0, DigitalRegister &d1, DigitalRegister &FLAG) { // d := d0 OR d1 +#ifdef USE_CUDA + cv::cuda::bitwise_or(d0.read(), d1.read(), d.read(), FLAG.read()); +#else cv::bitwise_or(d0.read(), d1.read(), d.read(), FLAG.read()); +#endif } void DigitalBus::OR_MASKED(DigitalRegister &d, DigitalRegister &d0, DigitalRegister &d1, DigitalRegister &d2, DigitalRegister &FLAG) { // d := d0 OR d1 OR d2 +#ifdef USE_CUDA + cv::cuda::bitwise_or(d0.read(), d1.read(), d0.read(), FLAG.read()); + cv::cuda::bitwise_or(d0.read(), d2.read(), d.read(), FLAG.read()); +#else cv::bitwise_or(d0.read(), d1.read(), d0.read(), FLAG.read()); cv::bitwise_or(d0.read(), d2.read(), d.read(), FLAG.read()); +#endif } void DigitalBus::OR_MASKED(DigitalRegister &d, DigitalRegister &d0, DigitalRegister &d1, DigitalRegister &d2, DigitalRegister &d3, DigitalRegister &FLAG) { // d := d0 OR d1 OR d2 OR d3 +#ifdef USE_CUDA + cv::cuda::bitwise_or(d0.read(), d1.read(), d0.read(), FLAG.read()); + cv::cuda::bitwise_or(d0.read(), d2.read(), d0.read(), FLAG.read()); + cv::cuda::bitwise_or(d0.read(), d3.read(), d.read(), FLAG.read()); +#else cv::bitwise_or(d0.read(), d1.read(), d0.read(), FLAG.read()); cv::bitwise_or(d0.read(), d2.read(), d0.read(), FLAG.read()); cv::bitwise_or(d0.read(), d3.read(), d.read(), FLAG.read()); +#endif } void DigitalBus::NOT_MASKED(DigitalRegister &d, DigitalRegister &d0, @@ -288,7 +348,11 @@ void DigitalBus::NOR_MASKED(DigitalRegister &d, DigitalRegister &d0, void DigitalBus::NOT_MASKED(DigitalRegister &Rl, DigitalRegister &FLAG) { // Rl := NOT Rl +#ifdef USE_CUDA + cv::cuda::bitwise_xor(Rl.read(), 1, Rl.read(), FLAG.read()); +#else cv::bitwise_xor(Rl.read(), 1, Rl.read(), FLAG.read()); +#endif } void DigitalBus::OR_MASKED(DigitalRegister &Rl, DigitalRegister &Rx, @@ -306,7 +370,11 @@ void DigitalBus::NOR_MASKED(DigitalRegister &Rl, DigitalRegister &Rx, void 
DigitalBus::AND_MASKED(DigitalRegister &Ra, DigitalRegister &Rx, DigitalRegister &Ry, DigitalRegister &FLAG) { // Ra := Rx AND Ry +#ifdef USE_CUDA + cv::cuda::bitwise_and(Rx.read(), Ry.read(), Ra.read(), FLAG.read()); +#else cv::bitwise_and(Rx.read(), Ry.read(), Ra.read(), FLAG.read()); +#endif } void DigitalBus::NAND_MASKED(DigitalRegister &Ra, DigitalRegister &Rx, @@ -341,7 +409,11 @@ void DigitalBus::NIMP_MASKED(DigitalRegister &Rl, DigitalRegister &Rx, void DigitalBus::XOR_MASKED(DigitalRegister &Rl, DigitalRegister &Rx, DigitalRegister &Ry, DigitalRegister &FLAG) { // Rl := Rx XOR Ry +#ifdef USE_CUDA + cv::cuda::bitwise_xor(Rx.read(), Ry.read(), Rl.read(), FLAG.read()); +#else cv::bitwise_xor(Rx.read(), Ry.read(), Rl.read(), FLAG.read()); +#endif } void DigitalBus::MOV_MASKED(DigitalRegister &d, DigitalRegister &d0, @@ -357,7 +429,11 @@ void DigitalBus::MUX_MASKED(DigitalRegister &Rl, DigitalRegister &Rx, // R1 = (Ry.~Rx) + (Rz.Rx) DigitalRegister intermediate(Rl.read().rows, Rl.read().cols); DigitalRegister intermediate2(Rl.read().rows, Rl.read().cols); +#ifdef USE_CUDA + cv::cuda::bitwise_not(Rx.read(), intermediate.read()); +#else cv::bitwise_not(Rx.read(), intermediate.read()); +#endif DigitalBus::AND_MASKED(intermediate2, Ry, intermediate, FLAG); DigitalBus::AND_MASKED(intermediate, Rz, Rx, FLAG); DigitalBus::OR_MASKED(Rl, intermediate, intermediate2, FLAG); @@ -380,9 +456,16 @@ void DigitalBus::get_up(DigitalRegister &dst, DigitalRegister &src, int offset, cv::Rect(0, 0, src.read().cols, src.read().rows - offset); auto write_chunk = cv::Rect(0, offset, src.read().cols, src.read().rows - offset); - src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); - auto fill = cv::Rect(0, 0, src.read().cols, offset); - dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + if (!dst.get_mask().empty()) { + src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); + auto fill = cv::Rect(0, 0, src.read().cols, offset); + 
dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + } else { + src.read()(read_chunk).copyTo(dst.read()(write_chunk)); + auto fill = cv::Rect(0, 0, src.read().cols, offset); + dst.read()(fill).setTo(cv::Scalar(boundary_fill)); + } + #ifdef TRACK_STATISTICS dst.inc_write(dst.get_mask()); src.inc_read(); @@ -396,10 +479,18 @@ void DigitalBus::get_right(DigitalRegister &dst, DigitalRegister &src, cv::Rect(offset, 0, src.read().cols - offset, src.read().rows); auto write_chunk = cv::Rect(0, 0, src.read().cols - offset, src.read().rows); - src.read()(read_chunk).copyTo(dst.read()(write_chunk)); - auto fill = - cv::Rect(src.read().cols - offset, 0, offset, src.read().rows); - dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + if (!dst.get_mask().empty()) { + src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); + auto fill = + cv::Rect(src.read().cols - offset, 0, offset, src.read().rows); + dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + } else { + src.read()(read_chunk).copyTo(dst.read()(write_chunk)); + auto fill = + cv::Rect(src.read().cols - offset, 0, offset, src.read().rows); + dst.read()(fill).setTo(cv::Scalar(boundary_fill)); + } + #ifdef TRACK_STATISTICS dst.inc_write(dst.get_mask()); src.inc_read(); @@ -413,9 +504,15 @@ void DigitalBus::get_left(DigitalRegister &dst, DigitalRegister &src, cv::Rect(0, 0, src.read().cols - offset, src.read().rows); auto write_chunk = cv::Rect(offset, 0, src.read().cols - offset, src.read().rows); - src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); - auto fill = cv::Rect(0, 0, offset, src.read().rows); - dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + if (!dst.get_mask().empty()) { + src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); + auto fill = cv::Rect(0, 0, offset, src.read().rows); + dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + } else { + 
src.read()(read_chunk).copyTo(dst.read()(write_chunk)); + auto fill = cv::Rect(0, 0, offset, src.read().rows); + dst.read()(fill).setTo(cv::Scalar(boundary_fill)); + } #ifdef TRACK_STATISTICS dst.inc_write(dst.get_mask()); src.inc_read(); @@ -429,10 +526,17 @@ void DigitalBus::get_down(DigitalRegister &dst, DigitalRegister &src, cv::Rect(0, offset, src.read().cols, src.read().rows - offset); auto write_chunk = cv::Rect(0, 0, src.read().cols, src.read().rows - offset); - src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); - auto fill = - cv::Rect(0, src.read().rows - offset, src.read().cols, offset); - dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + if (!dst.get_mask().empty()) { + src.read()(read_chunk).copyTo(dst.read()(write_chunk), dst.get_mask()); + auto fill = + cv::Rect(0, src.read().rows - offset, src.read().cols, offset); + dst.read()(fill).setTo(cv::Scalar(boundary_fill), dst.get_mask()); + } else{ + src.read()(read_chunk).copyTo(dst.read()(write_chunk)); + auto fill = + cv::Rect(0, src.read().rows - offset, src.read().cols, offset); + dst.read()(fill).setTo(cv::Scalar(boundary_fill)); + } #ifdef TRACK_STATISTICS dst.inc_write(dst.get_mask()); src.inc_read(); diff --git a/src/simulator/external/parser.cpp b/src/simulator/external/parser.cpp index 1988f1a..d6c072b 100644 --- a/src/simulator/external/parser.cpp +++ b/src/simulator/external/parser.cpp @@ -4,6 +4,7 @@ #include "simulator/external/parser.h" +#include #include #include #include @@ -16,8 +17,15 @@ #include #include #include +#include #include +using std::chrono::high_resolution_clock; +using std::chrono::duration_cast; +using std::chrono::duration; +using std::chrono::microseconds; +using std::chrono::milliseconds; + Parser::Parser() { enums_ = get_enums(); } @@ -198,16 +206,19 @@ Instructions Parser::parse_instructions(rttr::instance class_obj, std::ifstream& void Parser::execute_instructions(const Instructions& parsed, rttr::instance instance) { for 
(auto& [method, args]: parsed) { rttr::variant res; - switch (args.size()) { - case 0: res = method.invoke(instance); break; - case 1: res = method.invoke(instance, args[0]); break; - case 2: res = method.invoke(instance, args[0], args[1]); break; - case 3: res = method.invoke(instance, args[0], args[1], args[2]); break; - case 4: res = method.invoke(instance, args[0], args[1], args[2], args[3]); break; - case 5: res = method.invoke(instance, args[0], args[1], args[2], args[3], args[4]); break; - case 6: res = method.invoke(instance, args[0], args[1], args[2], args[3], args[4], args[5]); break; - default: std::cerr << "Too many arguments in method " << method.get_name() << " invocation" << std::endl; + for (int i = 0; i < repeat_; i++) { + switch (args.size()) { + case 0: res = method.invoke(instance); break; + case 1: res = method.invoke(instance, args[0]); break; + case 2: res = method.invoke(instance, args[0], args[1]); break; + case 3: res = method.invoke(instance, args[0], args[1], args[2]); break; + case 4: res = method.invoke(instance, args[0], args[1], args[2], args[3]); break; + case 5: res = method.invoke(instance, args[0], args[1], args[2], args[3], args[4]); break; + case 6: res = method.invoke(instance, args[0], args[1], args[2], args[3], args[4], args[5]); break; + default: std::cerr << "Too many arguments in method " << method.get_name() << " invocation" << std::endl; + } } + if (!res.is_valid()) { std::cerr << "Could not execute method \"" << method.get_name() << "\"" << " with " << args.size() << " arguments" << std::endl; @@ -320,6 +331,9 @@ rttr::variant Parser::create_instance(const std::string& arch_name, json arch_pr void Parser::parse_config(std::ifstream& config, std::ifstream& program) { json c = json::parse(config); + + setup_processing(c); + std::vector enums = get_enums(); // all registered enums // Create architecture by using builder @@ -370,14 +384,22 @@ void Parser::parse_config(std::ifstream& config, std::ifstream& program) { bool 
loop = frames < 0; + cv::TickMeter tm; + int i = 0; + double current_frame_time_avg = 0; while (loop || i < frames) { - int e1 = cv::getTickCount(); + i++; +// auto t1 = high_resolution_clock::now(); + tm.start(); Parser::execute_instructions(instructions, arch); - if (frame_time) { - int e2 = cv::getTickCount(); - std::cout << ((double)(e2 - e1) / cv::getTickFrequency()) * 1000 << " ms" << std::endl; +// auto t2 = high_resolution_clock::now(); +// duration ms_double = t2 - t1; +// std::cout << ms_double.count() << " microseconds\n"; + tm.stop(); + std::cout << tm.getTimeMilli() << " ms\n"; + tm.reset(); } if (ui_enabled) { @@ -395,7 +417,6 @@ void Parser::parse_config(std::ifstream& config, std::ifstream& program) { std::cout << "Frame: " << i << std::endl; } } - i++; } @@ -415,3 +436,55 @@ void Parser::parse_config(std::ifstream& config, std::ifstream& program) { // arch_builder_type.destroy(arch_builder); arch.get_type().destroy(arch); } + + +void Parser::setup_processing(json& j) { + + if (j.contains("instr_rep")) { + repeat_ = j["instr_rep"].get(); + } + +#ifdef USE_CUDA + // no OpenCL processing if we're using CUDA + std::cout << "Processing on GPU using CUDA" << std::endl; + cv::ocl::setUseOpenCL(false); + return; +#else + bool use_opencl = false; + if (j.contains("use_opencl")) { + use_opencl = j["use_opencl"].get(); + } + + if (!use_opencl) { + std::cout << "Processing on CPU" << std::endl; + cv::ocl::setUseOpenCL(false); + } else { + cv::ocl::setUseOpenCL(true); + + if (!cv::ocl::haveOpenCL()) + { + std::cout << "OpenCL is not available" << std::endl; + return; + } + + cv::ocl::Context context; + if (!context.create(cv::ocl::Device::TYPE_GPU)) + { + std::cout << "Failed to create GPU Context" << std::endl; + //return; + } + + std::cout << context.ndevices() << " GPU device(s) detected." 
<< std::endl; + for (size_t i = 0; i < context.ndevices(); i++) + { + cv::ocl::Device device = context.device(i); + std::cout << "name: " << device.name() << std::endl; + std::cout << "available: " << device.available() << std::endl; + std::cout << "imageSupport: " << device.imageSupport() << std::endl; + std::cout << "OpenCL_C_Version: " << device.OpenCL_C_Version() << std::endl; + std::cout << std::endl; + } + std::cout << "Processing on GPU using OpenCL" << std::endl; + } +#endif +} diff --git a/src/simulator/input/image_input.cpp b/src/simulator/input/image_input.cpp index aeca013..26aed1b 100644 --- a/src/simulator/input/image_input.cpp +++ b/src/simulator/input/image_input.cpp @@ -25,8 +25,11 @@ ImageInput::ImageInput(int rows, int cols, const std::string &path) std::string t_s = utility::opencv_type_to_str(img.type()); std::cout << "Image is of type: " << t_s << std::endl; - - this->frame = cv::Mat(rows, cols, MAT_TYPE); +#ifdef USE_CUDA + this->frame = cv::cuda::GpuMat(rows, cols, MAT_TYPE); +#else + this->frame = cv::UMat(rows, cols, MAT_TYPE); +#endif this->frame.setTo(0); } @@ -34,7 +37,12 @@ void ImageInput::read(Register ®) { reg.write(this->read()); } -cv::Mat ImageInput::read() { +#ifdef USE_CUDA +cv::cuda::GpuMat& ImageInput::read() { +#else +cv::UMat& ImageInput::read() { +#endif + cv::Mat temp(rows_, cols_, CV_32S); auto TIME_START = std::chrono::high_resolution_clock::now(); cv::Mat img = cv::imread(this->path_, cv::IMREAD_GRAYSCALE); @@ -45,7 +53,13 @@ cv::Mat ImageInput::read() { cv::resize(img, img, {cols_, rows_}); - img.convertTo(this->frame, MAT_TYPE, 1, -128); + img.convertTo(temp, MAT_TYPE, 1, -128); + +#ifdef USE_CUDA + this->frame.upload(temp); +#else + temp.copyTo(this->frame); +#endif auto TIME_END = std::chrono::high_resolution_clock::now(); long time_in_nano = std::chrono::duration_cast( diff --git a/src/simulator/input/live_input.cpp b/src/simulator/input/live_input.cpp index ca4d6ad..0371718 100644 --- 
a/src/simulator/input/live_input.cpp +++ b/src/simulator/input/live_input.cpp @@ -13,7 +13,7 @@ LiveInput::LiveInput(int rows, int cols, int camera_index) { this->rows_ = rows; this->cols_ = cols; - std::cout << "Using camera: " << camera_index << "\n"; + std::cout << "Using camera index: " << camera_index << "\n"; this->capture = std::make_unique(camera_index); if(!this->capture->isOpened()) { std::cerr << "Could not open camera" << std::endl; @@ -21,11 +21,19 @@ LiveInput::LiveInput(int rows, int cols, int camera_index) { } this->size = std::make_unique(cols, rows); - this->frame = cv::Mat(rows, cols, MAT_TYPE); +#ifdef USE_CUDA + this->frame = cv::cuda::GpuMat(rows, cols, MAT_TYPE); +#else + this->frame = cv::UMat(rows, cols, MAT_TYPE); +#endif this->frame.setTo(0); } -cv::Mat LiveInput::read() { +#ifdef USE_CUDA +cv::cuda::GpuMat& LiveInput::read() { +#else +cv::UMat& LiveInput::read() { +#endif #ifdef USE_RUNTIME_CHECKS if(this->capture == nullptr) { std::cerr << "No video capture defined" << std::endl; @@ -34,6 +42,7 @@ cv::Mat LiveInput::read() { cv::Mat temp(rows_, cols_, CV_32S); auto TIME_START = std::chrono::high_resolution_clock::now(); *this->capture >> temp; + #ifdef USE_RUNTIME_CHECKS if(temp.empty()) { std::cerr << "ERROR! 
blank frame grabbed" << std::endl; @@ -43,11 +52,17 @@ cv::Mat LiveInput::read() { int width = temp.cols; int height = temp.rows; - cv::Mat cropFrame = - temp(cv::Rect((width - height) / 2, 0, height - 1, height - 1)); + cv::Mat cropFrame = temp(cv::Rect((width - height) / 2, 0, height - 1, height - 1)); cv::resize(cropFrame, cropFrame, *this->size); cropFrame.convertTo(temp, MAT_TYPE, 1, -128); +#ifdef USE_CUDA + cv::Mat a; + this->frame.download(a); + cv::add(a, temp, a); + this->frame.upload(a); +#else cv::add(this->frame, temp, this->frame); +#endif auto TIME_END = std::chrono::high_resolution_clock::now(); long time_in_nano = std::chrono::duration_cast( TIME_END - TIME_START) diff --git a/src/simulator/input/video_input.cpp b/src/simulator/input/video_input.cpp index 056f3d2..e3c00f5 100644 --- a/src/simulator/input/video_input.cpp +++ b/src/simulator/input/video_input.cpp @@ -17,11 +17,19 @@ VideoInput::VideoInput(int rows, int cols, const std::string &path) { exit(1); } this->size = std::make_unique(cols, rows); - this->frame = cv::Mat(rows, cols, MAT_TYPE); +#ifdef USE_CUDA + this->frame = cv::cuda::GpuMat(rows, cols, MAT_TYPE); +#else + this->frame = cv::UMat(rows, cols, MAT_TYPE); +#endif this->frame.setTo(0); } -cv::Mat VideoInput::read() { +#ifdef USE_CUDA +cv::cuda::GpuMat& VideoInput::read() { +#else +cv::UMat& VideoInput::read() { +#endif LiveInput::read(); frame_count++; if(frame_count == this->capture->get(cv::CAP_PROP_FRAME_COUNT)) { diff --git a/src/simulator/memory/dram/dram3t_cell.cpp b/src/simulator/memory/dram/dram3t_cell.cpp index bf55f1d..349320a 100644 --- a/src/simulator/memory/dram/dram3t_cell.cpp +++ b/src/simulator/memory/dram/dram3t_cell.cpp @@ -12,22 +12,7 @@ void Dram3tCell::init() { process_node_ = 180; - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); -#ifdef TRACK_STATISTICS - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_read_power_ = calc_dynamic_read(); - 
dynamic_write_power_ = calc_dynamic_write(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); - time_ = this->cycle_count_ * (1.0/config_->get_clock_rate()); - scratch = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - this->calc_internal_mask(); -#endif + Memory::init(); } #ifdef TRACK_STATISTICS @@ -55,34 +40,6 @@ int Dram3tCell::get_cycle_count() { return cycle_count_; } -void Dram3tCell::update_static(double time) { - cv::add(this->array_static_energy_, this->static_power_ * time, this->array_static_energy_, this->internal_mask); -} - -void Dram3tCell::print_stats(const CycleCounter& counter) { - std::cout << "TODO: Implement in DRAM3TCELL" << std::endl; -} - -void Dram3tCell::read(const cv::_InputOutputArray& mask) { - scratch = 0; - cv::bitwise_and(this->internal_mask, mask, scratch); - cv::add(this->array_dynamic_energy_, this->dynamic_read_power_ * time_, this->array_dynamic_energy_, scratch); -} - -void Dram3tCell::read() { - read_count_++; -} - -void Dram3tCell::write(const cv::_InputOutputArray& mask) { - scratch = 0; - cv::bitwise_and(this->internal_mask, mask, scratch); - cv::add(this->array_dynamic_energy_, this->dynamic_write_power_ * time_, this->array_dynamic_energy_, scratch); -} - -void Dram3tCell::write() { - write_count_++; -} - int Dram3tCell::calc_transistor_count() { return 3; } @@ -91,12 +48,4 @@ double Dram3tCell::calc_dynamic() { return calc_dynamic_read() + calc_dynamic_write(); } -cv::Mat Dram3tCell::get_dynamic_energy_array() { - cv::add(this->array_dynamic_energy_, read_count_ * this->dynamic_read_power_ * time_, this->array_dynamic_energy_, internal_mask); - cv::add(this->array_dynamic_energy_, write_count_ * this->dynamic_write_power_ * time_, this->array_dynamic_energy_, 
internal_mask); - - return Component::get_dynamic_energy_array(); -} - #endif - diff --git a/src/simulator/memory/dram/dram_array.cpp b/src/simulator/memory/dram/dram_array.cpp index 9740e05..9456340 100644 --- a/src/simulator/memory/dram/dram_array.cpp +++ b/src/simulator/memory/dram/dram_array.cpp @@ -8,17 +8,8 @@ void Dram::init() { #ifdef TRACK_STATISTICS - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); time_ = this->cycle_count_ * (1.0 / config_->get_clock_rate()); - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - this->calc_internal_mask(); + Component::init(); #endif // dims = (arrays x rows_in_array x cols_in_row) int sizes[] = {((rows_ * cols_) / (row_stride_ * col_stride_)), array_rows_, array_cols_}; @@ -115,7 +106,7 @@ void Dram::print_row(int array, int row) { std::cout << std::endl; } -void Dram::reset() { +void Dram::reset_val() { data = 0; } @@ -151,11 +142,13 @@ double Dram::calc_dynamic() { } double Dram::calc_width() { - return array_cols_ / 15; + // say 2/bit + some extra for other circuitry + return array_cols_ * 2 + 10; } double Dram::calc_height() { - return array_rows_ / 15; + // say 2/row + some extra for other circuitry + return array_rows_ * 2 + 10; } void Dram::update_static(double time) { diff --git a/src/simulator/memory/memory.cpp b/src/simulator/memory/memory.cpp index 4d174bb..0a25e19 100644 --- a/src/simulator/memory/memory.cpp +++ b/src/simulator/memory/memory.cpp @@ -8,9 +8,71 @@ #include #include +#include + +void Memory::init() { +#ifdef TRACK_STATISTICS + dynamic_read_power_ = calc_dynamic_read(); + dynamic_write_power_ = calc_dynamic_write(); + time_ = this->cycle_count_ * (1.0 / 
config_->get_clock_rate()); +#ifdef USE_CUDA + scratch = cv::cuda::GpuMat(rows_, cols_, CV_8U, cv::Scalar(0)); +#else + scratch = cv::UMat(rows_, cols_, CV_8U, cv::Scalar(0)); +#endif + Component::init(); +#endif +} + +#ifdef TRACK_STATISTICS +int Memory::get_cycle_count() { + return cycle_count_; +} + +void Memory::update_static(double time) { + cv::add(this->array_static_energy_, this->static_power_ * time, this->array_static_energy_, this->internal_mask); +} + +void Memory::print_stats(const CycleCounter& counter) { + std::cerr << "TODO: Implement in memory.cpp" << std::endl; +} + +cv::Mat Memory::get_dynamic_energy_array() { + // add all the accesses with no masks + cv::add(this->array_dynamic_energy_, read_count_ * this->dynamic_read_power_ * time_, this->array_dynamic_energy_, internal_mask); + cv::add(this->array_dynamic_energy_, write_count_ * this->dynamic_write_power_ * time_, this->array_dynamic_energy_, internal_mask); + + return Component::get_dynamic_energy_array(); +} + +void Memory::read(const cv::_InputOutputArray& mask) { +#ifdef USE_CUDA + cv::cuda::add(this->array_dynamic_energy_, this->dynamic_read_power_ * time_, this->array_dynamic_energy_, mask); +#else + cv::add(this->array_dynamic_energy_, this->dynamic_read_power_ * time_, this->array_dynamic_energy_, mask); +#endif +} + +void Memory::read() { + read_count_++; +} + +void Memory::write(const cv::_InputOutputArray& mask) { +#ifdef USE_CUDA + cv::cuda::add(this->array_dynamic_energy_, this->dynamic_write_power_ * time_, this->array_dynamic_energy_, mask); +#else + cv::add(this->array_dynamic_energy_, this->dynamic_write_power_ * time_, this->array_dynamic_energy_, mask); +#endif +} + +void Memory::write() { + write_count_++; +} +#endif + std::shared_ptr Memory::construct(MemoryType memory_type, int rows, int cols, int row_stride, int col_stride, const std::shared_ptr& config) { std::shared_ptr memory; - switch(memory_type) { + switch (memory_type) { case DRAM3T: { memory = 
std::make_shared(); break; @@ -32,4 +94,3 @@ std::shared_ptr Memory::construct(MemoryType memory_type, int rows, int memory->init(); return memory; } - diff --git a/src/simulator/memory/si/si_cell.cpp b/src/simulator/memory/si/si_cell.cpp index afab954..d339472 100644 --- a/src/simulator/memory/si/si_cell.cpp +++ b/src/simulator/memory/si/si_cell.cpp @@ -10,22 +10,7 @@ void SiCell::init() { process_node_ = 180; - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); -#ifdef TRACK_STATISTICS - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_read_power_ = calc_dynamic_read(); - dynamic_write_power_ = calc_dynamic_write(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); - time_ = this->cycle_count_ * (1.0/config_->get_clock_rate()); - scratch = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - this->calc_internal_mask(); -#endif + Memory::init(); } #ifdef TRACK_STATISTICS @@ -59,43 +44,4 @@ double SiCell::calc_height() { return this->scale_width(base); } -int SiCell::get_cycle_count() { - return cycle_count_; -} - -void SiCell::read(const cv::_InputOutputArray& mask) { - scratch = 0; - cv::bitwise_and(this->internal_mask, mask, scratch); - cv::add(this->array_dynamic_energy_, this->dynamic_read_power_ * time_, this->array_dynamic_energy_, scratch); -} - -void SiCell::read() { - read_count_++; -} - -void SiCell::write(const cv::_InputOutputArray& mask) { - scratch = 0; - cv::bitwise_and(this->internal_mask, mask, scratch); - cv::add(this->array_dynamic_energy_, this->dynamic_write_power_ * time_, this->array_dynamic_energy_, scratch); -} - -void SiCell::write() { - write_count_++; -} - -void SiCell::update_static(double time) { - cv::add(this->array_static_energy_, 
this->static_power_ * time, this->array_static_energy_, this->internal_mask); -} - -void SiCell::print_stats(const CycleCounter& counter) { - std::cout << "TODO: Implement in SICELL" << std::endl; -} - -cv::Mat SiCell::get_dynamic_energy_array() { - cv::add(this->array_dynamic_energy_, read_count_ * this->dynamic_read_power_ * time_, this->array_dynamic_energy_, internal_mask); - cv::add(this->array_dynamic_energy_, write_count_ * this->dynamic_write_power_ * time_, this->array_dynamic_energy_, internal_mask); - - return Component::get_dynamic_energy_array(); -} - #endif diff --git a/src/simulator/memory/sram/sram6t_cell.cpp b/src/simulator/memory/sram/sram6t_cell.cpp index 93a786c..f5950a2 100644 --- a/src/simulator/memory/sram/sram6t_cell.cpp +++ b/src/simulator/memory/sram/sram6t_cell.cpp @@ -10,22 +10,7 @@ void Sram6tCell::init() { process_node_ = 180; - internal_mask = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); -#ifdef TRACK_STATISTICS - transistor_count_ = calc_transistor_count(); - static_power_ = calc_static(); - dynamic_read_power_ = calc_dynamic_read(); - dynamic_write_power_ = calc_dynamic_write(); - dynamic_power_ = calc_dynamic(); - width_ = calc_width(); - height_ = calc_height(); - time_ = this->cycle_count_ * (1.0/config_->get_clock_rate()); - scratch = cv::Mat(rows_, cols_, CV_8U, cv::Scalar(0)); - array_transistor_count_ = cv::Mat(rows_, cols_, CV_32S, cv::Scalar(0)); - array_static_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - array_dynamic_energy_ = cv::Mat(rows_, cols_, CV_64F, cv::Scalar(0)); - this->calc_internal_mask(); -#endif + Memory::init(); } #ifdef TRACK_STATISTICS @@ -54,34 +39,6 @@ int Sram6tCell::get_cycle_count() { return cycle_count_; } -void Sram6tCell::read(const cv::_InputOutputArray& mask) { - scratch = 0; - cv::bitwise_and(this->internal_mask, mask, scratch); - cv::add(this->array_dynamic_energy_, this->dynamic_read_power_ * time_, this->array_dynamic_energy_, scratch); -} - -void Sram6tCell::read() { - 
read_count_++; -} - -void Sram6tCell::write(const cv::_InputOutputArray& mask) { - scratch = 0; - cv::bitwise_and(this->internal_mask, mask, scratch); - cv::add(this->array_dynamic_energy_, this->dynamic_write_power_ * time_, this->array_dynamic_energy_, scratch); -} - -void Sram6tCell::write() { - write_count_++; -} - -void Sram6tCell::update_static(double time) { - cv::add(this->array_static_energy_, this->static_power_ * time, this->array_static_energy_, this->internal_mask); -} - -void Sram6tCell::print_stats(const CycleCounter& counter) { - std::cout << "TODO: Implement in SRAM6TCELL" << std::endl; -} - int Sram6tCell::calc_transistor_count() { return 6; } @@ -90,10 +47,4 @@ double Sram6tCell::calc_dynamic() { return calc_dynamic_read() + calc_dynamic_write(); } -cv::Mat Sram6tCell::get_dynamic_energy_array() { - cv::add(this->array_dynamic_energy_, read_count_ * this->dynamic_read_power_ * time_, this->array_dynamic_energy_, internal_mask); - cv::add(this->array_dynamic_energy_, write_count_ * this->dynamic_write_power_ * time_, this->array_dynamic_energy_, internal_mask); - return Component::get_dynamic_energy_array(); -} - #endif diff --git a/src/simulator/metrics/packer.cpp b/src/simulator/metrics/packer.cpp index c8c9991..ccdcc43 100644 --- a/src/simulator/metrics/packer.cpp +++ b/src/simulator/metrics/packer.cpp @@ -61,6 +61,8 @@ std::shared_ptr Packer::split_node(std::shared_ptr node, dou } std::shared_ptr Packer::pack(std::vector>& components) { +#ifdef TRACK_STATISTICS + if (components.empty()) { return std::make_shared(); } @@ -84,4 +86,5 @@ std::shared_ptr Packer::pack(std::vector>& } return this->root_; +#endif } diff --git a/src/simulator/registers/analogue_register.cpp b/src/simulator/registers/analogue_register.cpp index f26b39f..8bf195f 100644 --- a/src/simulator/registers/analogue_register.cpp +++ b/src/simulator/registers/analogue_register.cpp @@ -31,6 +31,22 @@ AnalogueRegister::AnalogueRegister(int rows, int cols, int row_stride, int col_s 
this->max_val = 127; } +#ifdef USE_CUDA +AnalogueRegister::AnalogueRegister(const cv::cuda::GpuMat &data, int row_stride, int col_stride) { +#else +AnalogueRegister::AnalogueRegister(const cv::UMat &data, int row_stride, int col_stride) { +#endif + this->rows_ = data.rows; + this->cols_ = data.cols; + this->row_stride_ = row_stride; + this->col_stride_ = col_stride; + this->type_ = MAT_TYPE; + Register::init(); + this->min_val = -128; + this->max_val = 127; + this->write(data); +} + AnalogueRegister::AnalogueRegister(const cv::Mat &data, int row_stride, int col_stride) { this->rows_ = data.rows; this->cols_ = data.cols; diff --git a/src/simulator/registers/digital_register.cpp b/src/simulator/registers/digital_register.cpp index 35fd0b2..548e040 100644 --- a/src/simulator/registers/digital_register.cpp +++ b/src/simulator/registers/digital_register.cpp @@ -30,6 +30,22 @@ DigitalRegister::DigitalRegister(int rows, int cols, int row_stride, int col_str this->max_val = 1; } +#ifdef USE_CUDA +DigitalRegister::DigitalRegister(const cv::cuda::GpuMat &data, int row_stride, int col_stride) { +#else +DigitalRegister::DigitalRegister(const cv::UMat &data, int row_stride, int col_stride) { +#endif + this->rows_ = data.rows; + this->cols_ = data.cols; + this->row_stride_ = row_stride; + this->col_stride_ = col_stride; + this->type_ = CV_8U; + Register::init(); + this->min_val = 0; + this->max_val = 1; + this->write(data); +} + DigitalRegister::DigitalRegister(const cv::Mat &data, int row_stride, int col_stride) { this->rows_ = data.rows; this->cols_ = data.cols; @@ -48,10 +64,18 @@ DigitalRegister &DigitalRegister::operator()(const std::string &name) { } void DigitalRegister::set_mask(const std::shared_ptr& mask) { - this->mask_ = std::make_shared(mask->read()); +#ifdef USE_CUDA + this->mask_ = std::make_shared(mask->read()); +#else + this->mask_ = std::make_shared(mask->read()); +#endif } -cv::Mat& DigitalRegister::get_mask() { +#ifdef USE_CUDA +cv::cuda::GpuMat& 
DigitalRegister::get_mask() { +#else +cv::UMat& DigitalRegister::get_mask() { +#endif return *mask_; } diff --git a/src/simulator/registers/register.cpp b/src/simulator/registers/register.cpp index 853b170..f11ce9f 100644 --- a/src/simulator/registers/register.cpp +++ b/src/simulator/registers/register.cpp @@ -8,24 +8,48 @@ void Register::init() { - this->value_ = cv::Mat(rows_, cols_, type_, cv::Scalar(0)); +#ifdef USE_CUDA + this->value_ = cv::cuda::GpuMat(rows_, cols_, type_, cv::Scalar(0)); +#else + this->value_ = cv::UMat(rows_, cols_, type_, cv::Scalar(0)); +#endif + Component::init(); } -cv::Mat &Register::read() { +#ifdef USE_CUDA +cv::cuda::GpuMat& Register::read() { +#else +cv::UMat& Register::read() { +#endif #ifdef TRACK_STATISTICS this->inc_read(); #endif return this->value_; } -void Register::write(cv::Mat &data) { +#ifdef USE_CUDA +void Register::write(cv::cuda::GpuMat& data) { +#else +void Register::write(cv::UMat& data) { +#endif + data.copyTo(this->value_); +#ifdef TRACK_STATISTICS + this->inc_write(); +#endif +} + +#ifdef USE_CUDA +void Register::write(const cv::cuda::GpuMat& data) { +#else +void Register::write(const cv::UMat& data) { +#endif data.copyTo(this->value_); #ifdef TRACK_STATISTICS this->inc_write(); #endif } -void Register::write(const cv::Mat &data) { +void Register::write(const cv::Mat& data) { data.copyTo(this->value_); #ifdef TRACK_STATISTICS this->inc_write(); @@ -36,7 +60,22 @@ void Register::write(Register &data) { this->write(data.read()); } -void Register::write(cv::Mat &data, cv::Mat &mask) { +#ifdef USE_CUDA +void Register::write(cv::cuda::GpuMat& data, cv::cuda::GpuMat &mask) { +#else +void Register::write(cv::UMat& data, cv::UMat &mask) { +#endif + data.copyTo(this->value_, mask); +#ifdef TRACK_STATISTICS + this->inc_write(mask); +#endif +} + +#ifdef USE_CUDA +void Register::write(cv::cuda::GpuMat& data, cv::Mat &mask) { +#else +void Register::write(cv::UMat& data, cv::Mat &mask) { +#endif data.copyTo(this->value_, 
mask); #ifdef TRACK_STATISTICS this->inc_write(mask); @@ -54,7 +93,11 @@ void Register::write(int data) { #endif } -void Register::write(int data, cv::Mat &mask) { +#ifdef USE_CUDA +void Register::write(int data, cv::cuda::GpuMat &mask) { +#else +void Register::write(int data, cv::UMat &mask) { +#endif this->value_.setTo(data, mask); #ifdef TRACK_STATISTICS this->inc_write(mask); @@ -158,6 +201,7 @@ int Register::get_transistor_count() { } } + #endif diff --git a/src/simulator/util/utility.cpp b/src/simulator/util/utility.cpp index cc37370..5e05710 100644 --- a/src/simulator/util/utility.cpp +++ b/src/simulator/util/utility.cpp @@ -7,7 +7,15 @@ #include void utility::remap_register(Register ®, cv::Mat &dst) { - reg.read().convertTo(dst, CV_8U, 255.0 / (reg.max_val - reg.min_val), + + cv::UMat m; +#ifdef USE_CUDA + reg.read().download(m); +#else + m = reg.read(); +#endif + + m.convertTo(dst, CV_8U, 255.0 / (reg.max_val - reg.min_val), -reg.min_val * 255.0 / (reg.max_val - reg.min_val)); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 6975422..58c6216 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET_NAME tests) -add_executable(${TARGET_NAME} main.cpp utility.cpp utility.h buses/analogue_bus_test.cpp buses/digital_bus_test.cpp scamp5/scamp5_test.cpp ../scamp5/scamp5.cpp ../scamp5/scamp5.h ../scamp5_extended/scamp5_e.h ../scamp5_extended/scamp5_e.cpp scamp5e/scamp5e_test.cpp ../src/simulator/registers/register.cpp ../src/simulator/registers/analogue_register.cpp ../src/simulator/registers/digital_register.cpp ../src/simulator/buses/analogue_bus.cpp ../src/simulator/buses/digital_bus.cpp ../src/simulator/base/architecture.cpp ../src/simulator/base/pixel.cpp ../src/simulator/pe/processing_element.cpp ../src/simulator/memory/dram/dram_array.cpp ../src/simulator/util/utility.cpp ../src/simulator/base/config.cpp ../src/simulator/metrics/cycle_counter.cpp ../src/simulator/memory/memory.cpp 
../src/simulator/input/live_input.cpp ../src/simulator/input/image_input.cpp ../src/simulator/input/video_input.cpp ../src/simulator/base/plane_params.cpp ../src/simulator/memory/sram/sram6t_cell.cpp ../src/simulator/memory/dram/dram3t_cell.cpp ../src/simulator/memory/si/si_cell.cpp ../src/simulator/external/parser.cpp ../src/simulator/ui/ui.cpp ../src/simulator/ui/file_watcher.cpp ../src/simulator/ui/src/base64_encoder.cpp memory/dram_array_test.cpp ../scamp5/news_t.h ../src/simulator/base/component.cpp ../src/simulator/metrics/packer.cpp ../src/simulator/metrics/pack_node.cpp) +add_executable(${TARGET_NAME} main.cpp utility.cpp utility.h buses/analogue_bus_test.cpp buses/digital_bus_test.cpp scamp5/scamp5_test.cpp ../scamp5/scamp5.cpp ../scamp5/scamp5.h ../scamp5_extended/scamp5_e.h ../scamp5_extended/scamp5_e.cpp scamp5e/scamp5e_test.cpp ../src/simulator/registers/register.cpp ../src/simulator/registers/analogue_register.cpp ../src/simulator/registers/digital_register.cpp ../src/simulator/buses/analogue_bus.cpp ../src/simulator/buses/digital_bus.cpp ../src/simulator/base/architecture.cpp ../src/simulator/base/pixel.cpp ../src/simulator/pe/processing_element.cpp ../src/simulator/memory/dram/dram_array.cpp ../src/simulator/util/utility.cpp ../src/simulator/base/config.cpp ../src/simulator/metrics/cycle_counter.cpp ../src/simulator/memory/memory.cpp ../src/simulator/input/live_input.cpp ../src/simulator/input/image_input.cpp ../src/simulator/input/video_input.cpp ../src/simulator/base/plane_params.cpp ../src/simulator/memory/sram/sram6t_cell.cpp ../src/simulator/memory/dram/dram3t_cell.cpp ../src/simulator/memory/si/si_cell.cpp ../src/simulator/external/parser.cpp ../src/simulator/ui/ui.cpp ../src/simulator/ui/file_watcher.cpp ../src/simulator/ui/src/base64_encoder.cpp memory/dram_array_test.cpp ../scamp5/news_t.h ../src/simulator/base/component.cpp ../src/simulator/metrics/packer.cpp ../src/simulator/metrics/pack_node.cpp ../src/simulator/base/opencv_wrappers.cpp) 
target_link_libraries(${TARGET_NAME} PUBLIC ${CONAN_LIBS} ${OpenCV_LIBS})