disassembler, assembler, bug fixes, and more

This commit is contained in:
Zoe
2025-03-07 15:05:22 +00:00
parent d64f63b165
commit 587fda2d49
14 changed files with 4307 additions and 1082 deletions

View File

@@ -0,0 +1,533 @@
#include <bits/stdc++.h>
#include <cstddef>
#include <cstdint>
#include <fcntl.h>
#include <format>
#include <queue>
#include <string>
#include <sys/types.h>
#include <unistd.h>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#define BYTECODE_READER_IMPLEMENTATION
#include "reader.hpp"
#include <cstdio>
struct label {
enum type {
type_instruction,
type_byte,
} type;
uint16_t length;
std::string name;
};
uint16_t *find_closest_label(const std::unordered_map<uint16_t, label> &map,
uint16_t target) {
uint16_t *closest_label = nullptr;
for (const auto &pair : map) {
if (pair.first <= target) {
if (closest_label != nullptr && pair.first < *closest_label) {
continue;
}
if (closest_label == nullptr) {
closest_label = (uint16_t *)calloc(1, sizeof(uint16_t));
if (closest_label == NULL) {
throw std::runtime_error("Failed to allocate memory");
}
}
*closest_label = pair.first;
} else {
break;
}
}
return closest_label;
}
std::string to_hex(size_t number) { return std::format("{:04x}", number); }
std::string
get_label_for_byte(uint16_t byte,
std::unordered_map<uint16_t, struct label> labels,
enum label::type target_type) {
std::string operand_str;
operand_str.append("0x");
operand_str.append(to_hex(byte));
if (labels.find(byte - 0x200) != labels.end()) {
operand_str.clear();
operand_str = labels.at(byte - 0x200).name;
} else {
// try to see if the operand is offset from a label
if (byte > 0x200) {
uint16_t *closest_label = find_closest_label(labels, byte - 0x200);
if (closest_label != nullptr &&
labels.at(*closest_label).type == target_type) {
uint16_t offset = (byte - 0x200) - (*closest_label);
// discard our result if the offset is greater than the
// length of the label
if (offset > labels.at(*closest_label).length) {
return operand_str;
}
operand_str.clear();
operand_str = labels.at(*closest_label).name;
operand_str.append(" + 0x");
operand_str.append(to_hex(offset));
operand_str.append("");
}
}
}
return operand_str;
}
// I could emit something like omni assembly, nut that is significantly more
// complex than just emitting the assembly like this, so I am just going to
// emit the assembly like this for now
void print_instruction(Bytecode bytecode, uint16_t pc,
std::unordered_map<uint16_t, struct label> labels) {
switch (bytecode.instruction_type) {
case HLT:
printf("halt\n");
break;
case EXIT:
printf("exit 0x%02x\n", bytecode.operand.byte);
break;
case SYS:
if (labels.find(bytecode.operand.word - 0x200) == labels.end()) {
fprintf(stderr, "No label found for %04x\n",
bytecode.operand.word - 0x200);
exit(1);
}
printf("sys %s\n",
labels.find(bytecode.operand.word - 0x200)->second.name.c_str());
break;
case CLS:
printf("cls\n");
break;
case RET:
printf("ret\n");
break;
case JP:
if (pc == 0 && bytecode.operand.word == 0x260) {
printf("hires\n");
break;
}
if (labels.find(bytecode.operand.word - 0x200) == labels.end()) {
fprintf(stderr, "No label found for %04x\n",
bytecode.operand.word - 0x200);
exit(1);
}
printf("jp %s\n",
labels.find(bytecode.operand.word - 0x200)->second.name.c_str());
break;
case CALL:
if (labels.find(bytecode.operand.word - 0x200) == labels.end()) {
fprintf(stderr, "No label found for %04x\n",
bytecode.operand.word - 0x200);
exit(1);
}
printf("call %s\n",
labels.find(bytecode.operand.word - 0x200)->second.name.c_str());
break;
case SKIP_INSTRUCTION_BYTE:
printf("se v%x, 0x%02x\n", bytecode.operand.byte_reg.reg,
bytecode.operand.byte_reg.byte);
break;
case SKIP_INSTRUCTION_NE_BYTE:
printf("sne v%x, 0x%02x\n", bytecode.operand.byte_reg.reg,
bytecode.operand.byte_reg.byte);
break;
case SKIP_INSTRUCTION_REG:
printf("se v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case SKIP_INSTRUCTION_NE_REG:
printf("sne v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case LOAD_BYTE: {
printf("ld v%x, 0x%02x\n", bytecode.operand.byte_reg.reg,
bytecode.operand.byte_reg.byte);
break;
}
case ADD_BYTE:
printf("add v%x, 0x%02x\n", bytecode.operand.byte_reg.reg,
bytecode.operand.byte_reg.byte);
break;
case LOAD_REG:
printf("ld v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case ADD_REG:
printf("add v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case OR_REG:
printf("or v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case AND_REG:
printf("and v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case XOR_REG:
printf("xor v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case SHR_REG:
printf("shr v%x\n", bytecode.operand.reg_reg.x);
break;
case SUB_REG:
printf("sub v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case SUBN_REG:
printf("subn v%x, v%x\n", bytecode.operand.reg_reg.x,
bytecode.operand.reg_reg.y);
break;
case SHL_REG:
printf("shl v%x\n", bytecode.operand.reg_reg.x);
break;
case LOAD_I_BYTE: {
printf("ld I, %s\n", get_label_for_byte(bytecode.operand.word, labels,
label::type_byte)
.c_str());
break;
}
case JP_V0_BYTE: {
printf("jp v0, %s\n", get_label_for_byte(bytecode.operand.word, labels,
label::type_instruction)
.c_str());
break;
}
case RND:
printf("rnd v%x, 0x%02x\n", bytecode.operand.byte_reg.reg,
bytecode.operand.byte_reg.byte);
break;
case DRW:
printf("drw v%x, v%x, 0x%x\n", bytecode.operand.reg_reg_nibble.x,
bytecode.operand.reg_reg_nibble.y,
bytecode.operand.reg_reg_nibble.nibble);
break;
case SKIP_PRESSED_REG:
printf("skp v%x\n", bytecode.operand.byte);
break;
case SKIP_NOT_PRESSED_REG:
printf("sknp v%x\n", bytecode.operand.byte);
break;
case LD_REG_DT:
printf("ld v%x, dt\n", bytecode.operand.byte);
break;
case LD_REG_K:
printf("ld v%x, k\n", bytecode.operand.byte);
break;
case LD_DT_REG:
printf("ld dt, v%x\n", bytecode.operand.byte);
break;
case LD_ST_REG:
printf("ld st, v%x\n", bytecode.operand.byte);
break;
case ADD_I_REG:
printf("add I, v%x\n", bytecode.operand.byte);
break;
case LD_F_REG:
printf("ld f, v%x\n", bytecode.operand.byte);
break;
case LD_B_REG:
printf("ld b, v%x\n", bytecode.operand.byte);
break;
case LD_PTR_I_REG:
printf("ld [I], v%x\n", bytecode.operand.byte);
break;
case LD_REG_PTR_I:
printf("ld v%x, [I]\n", bytecode.operand.byte);
break;
case UNKNOWN_INSTRUCTION:
printf("?\n");
}
}
struct hole {
uint16_t start;
uint16_t end;
};
struct assembly_node {
uint16_t address;
enum type {
type_byte,
type_instruction,
} type;
union {
uint8_t byte;
Bytecode instruction;
};
};
void write_assembly(std::vector<struct assembly_node> assembly,
std::unordered_map<uint16_t, struct label> labels) {
std::sort(assembly.begin(), assembly.end(),
[](const struct assembly_node &a, const struct assembly_node &b) {
return a.address < b.address;
});
uint16_t last_byte_address = 0x0000;
for (auto &node : assembly) {
if (node.type != assembly_node::type_byte && node.address != 0 &&
node.address - 1 == last_byte_address) {
printf("\n");
}
if (labels.find(node.address) != labels.end()) {
if (node.address != 0x0000) {
// add whitespacing between labels, but not for the _start label
printf("\n");
}
printf("%s:\n", labels.at(node.address).name.c_str());
}
switch (node.type) {
case assembly_node::type_byte:
if (node.address != last_byte_address + 1) {
printf("db ");
} else {
printf(",\n ");
}
printf("0x%02x", node.byte);
last_byte_address = node.address;
break;
case assembly_node::type_instruction:
// if previous assembly was a byte, then we need to emit a new line
print_instruction(node.instruction, node.address, labels);
break;
}
}
}
void disassemble(uint8_t *rom, int rom_size) {
// evaluate the bytecode, but dont actually execute it, just print it out,
// and when we reach a branching instruction, follow it. Make sure that if
// we enter an inifinite loop we dont just loop forever, so make sure we
// keep track of what we have already visited
std::unordered_set<uint16_t> addresses_visited;
std::queue<uint16_t> work_queue;
std::vector<struct hole> holes;
std::unordered_map<uint16_t, struct label> labels;
size_t label_idx = 0;
uint16_t *stack = (uint16_t *)calloc(16, sizeof(uint16_t));
if (stack == NULL) {
fprintf(stderr, "Failed to allocate stack!");
exit(1);
}
// holds the start of a label
size_t stack_idx = 0;
std::vector<struct assembly_node> assembly;
// start at the beginning of the rom
work_queue.push(0x0000);
labels.emplace(
0x0000,
label{.type = label::type_instruction, .length = 0, .name = "_start"});
while (!work_queue.empty()) {
uint16_t pc = work_queue.front();
work_queue.pop();
if (pc >= (uint16_t)rom_size) {
// if we are reading past the end of the rom, we are done
break;
}
if (addresses_visited.find(pc) != addresses_visited.end())
continue;
addresses_visited.insert(pc);
uint16_t opcode = (rom[pc] << 8) | rom[pc + 1];
Bytecode bytecode = parse(opcode);
assembly.push_back({.address = pc,
.type = assembly_node::type_instruction,
.instruction = bytecode});
switch (bytecode.instruction_type) {
case JP: {
if (pc == 0 && bytecode.operand.word == 0x260) {
work_queue.push(0x2C0);
break;
}
if (!labels.contains(bytecode.operand.word - 0x200)) {
labels.emplace(
bytecode.operand.word - 0x200,
label{.type = label::type_instruction,
.length = 0,
.name = "_" + std::to_string(label_idx++)});
}
work_queue.push(bytecode.operand.word - 0x200);
break;
}
case CALL: {
if (stack_idx == 16) {
fprintf(stderr, "Stack overflow!\n");
exit(1);
}
if (!labels.contains(bytecode.operand.word - 0x200)) {
labels.emplace(
bytecode.operand.word - 0x200,
label{.type = label::type_instruction,
.length = 0,
.name = "_" + std::to_string(label_idx++)});
}
stack[stack_idx++] = pc + 2;
work_queue.push(bytecode.operand.word - 0x200);
break;
}
case SKIP_INSTRUCTION_BYTE:
case SKIP_INSTRUCTION_NE_BYTE:
case SKIP_INSTRUCTION_REG:
case SKIP_INSTRUCTION_NE_REG:
case SKIP_PRESSED_REG:
case SKIP_NOT_PRESSED_REG: {
work_queue.push(pc + 2);
work_queue.push(pc + 4);
break;
}
case RET: {
if (stack_idx == 0) {
fprintf(stderr, "Stack underflow!\n");
exit(1);
}
uint16_t ret_pc = stack[--stack_idx];
work_queue.push(ret_pc);
break;
}
case HLT: { // Stop following
break;
}
case UNKNOWN_INSTRUCTION: {
fprintf(stderr, "Unknown instruction: %04x\n", opcode);
// we failed at disassembling smartly
break;
}
default:
work_queue.push(pc + 2);
}
}
bool skip = false;
uint16_t *last_seen_byte_array = nullptr;
uint16_t start_of_last_contiguous_block = 0x0000;
for (uint16_t pc = 0x00; pc < rom_size; pc++) {
if (skip) {
skip = false;
continue;
}
if (addresses_visited.find(pc) != addresses_visited.end()) {
// when there is an instruction that we have already visited, we
// want to skip this byte and the next byte, but we cant rely of the
// instructions being aligned to 0x02 bytes, so instead we tell the
// next run of the loop to skip
skip = true;
continue;
}
// this seems scary, but it's fine because the if block will jump down
// if the first condition is met, so it will never dereference a null
// pointer
if (last_seen_byte_array == nullptr ||
*last_seen_byte_array != pc - 1) {
if (last_seen_byte_array == nullptr) {
last_seen_byte_array = new uint16_t;
}
start_of_last_contiguous_block = pc;
// we are not in a contiguous block of bytes, so we need to add a
// label
if (!labels.contains(pc)) {
labels.emplace(
pc, label{.type = label::type_byte,
.length = 1,
.name = "_" + std::to_string(label_idx++)});
}
} else {
// we are in a contiguous block of bytes, so we need to update the
// label's length by one
labels[start_of_last_contiguous_block].length++;
}
*last_seen_byte_array = pc;
assembly.push_back(
{.address = pc, .type = assembly_node::type_byte, .byte = rom[pc]});
}
for (auto &pair : labels) {
uint16_t pc = pair.first;
uint16_t label_length = 0;
// while we havent reached the end of the rom, and we havent crossed
// into a new label
while (pc < rom_size) {
label_length++;
switch (pair.second.type) {
case label::type_byte:
pc++;
break;
case label::type_instruction:
pc += 2;
break;
}
if (labels.find(pc) != labels.end())
break;
}
pair.second.length = label_length;
}
write_assembly(assembly, labels);
}
int main(int argc, char **argv) {
if (argc < 2) {
printf("Usage: %s <file>\n", argv[0]);
return 1;
}
int rom_fd = open(argv[1], O_RDONLY);
if (rom_fd < 0) {
fprintf(stderr, "Failed to open file: %s\n", argv[1]);
return 1;
}
int rom_size = lseek(rom_fd, 0, SEEK_END);
(void)lseek(rom_fd, 0, SEEK_SET);
uint8_t *rom = (uint8_t *)calloc(rom_size, sizeof(uint8_t));
if (rom == NULL) {
fprintf(stderr, "Failed to allocate memory!\n");
return 1;
}
read(rom_fd, rom, rom_size);
disassemble(rom, rom_size);
return 0;
}