OS API와 Overhead의 이해

Kernel space explained (Win32 kernel as example)

User-Kernel Boundary Crossing

Frequency of Calls and Work Done Per Call

Mitigations and Best Practices

Frequently Used System Calls (Win32)

Relative Cost and Performance Considerations

Operation Time Scale
Single CPU instruction execution 0.1 to 1 nanosecond (ns)
Level 1 (L1) cache access 0.5 to 1 ns
Level 2 (L2) cache access 3 to 10 ns
Branch mispredict penalty 5 to 20 ns
Level 3 (L3) cache access 10 to 20 ns
Main memory (RAM) access (cache miss) 50 to 100 ns
Mutex lock/unlock 25 to 50 ns (but can vary widely with contention)
Solid-State Drive (SSD) access 50 to 150 microseconds (us)
Disk seek (Hard Drive) 1 to 10 milliseconds (ms) (varies by drive type and age)
Round-Trip Time (RTT) in the same data center 100 us to 1 ms
Sending 1KB packet over 1Gbps network 10 us (not including propagation delays)
Round-Trip Time (RTT) on the Internet (cross-country) 20 to 50 ms
Round-Trip Time (RTT) on the Internet (global/international) 100 to 300 ms
Disk read (CD-ROM, older media) 100 to 300 ms

Why Knowing API Overhead Is Important for Python Developers

API 최적화 실험

API 호출을 최소화하는 방법을 테스트해 보려고 합니다. 동일 host에서 server와 client 프로세스가 통신하는 데 걸리는 시간을 재는 실험입니다. 언어는 C이며, macOS에서 테스트했습니다.

한쪽은 System V message queue(msgsnd, msgrcv) IPC로 통신하며, 메시지를 주고받을 때마다 syscall이 발생합니다.

다른 한쪽은 shared memory를 사용합니다. 통신을 위한 syscall은 메모리 생성 및 연결 시에만 발생하고, 이후에는 공유 메모리에 있는 플래그와 카운터를 읽고 쓰는 방식으로 서로 동기화하며 메시지를 전달합니다.

client와 server는 1씩 증가하는 sequence 번호를 통신 데이터로 사용합니다.

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>    // O_RDWR
#include <sys/mman.h> // shm_open, mmap
#include <sys/stat.h> // For mode constants
#include <unistd.h>   // ftruncate, close

#define SHM_NAME   "/my_shm_example"
#define SHM_SIZE   sizeof(SharedData)

/*
 * Layout of the shared-memory region this client maps.
 * main() sets ready_flag, writes each sequence number into request_count,
 * and polls done_flag; total_requests is not accessed by this program.
 * All fields are volatile ints, so every access in main() goes to memory.
 */
typedef struct {
    volatile int request_count;   // written by main(): current sequence number, 1..num_requests
    volatile int total_requests;  // neither read nor written by this program
    volatile int ready_flag;      // set to 1 by main() before the first request
    volatile int done_flag;       // polled by main() until it becomes non-zero
} SharedData;

/*
 * Shared-memory benchmark client.
 *
 * Usage: <program> <num_requests>
 *
 * Maps the existing shared-memory object SHM_NAME, raises ready_flag, then
 * publishes the sequence numbers 1..num_requests through request_count,
 * sleeping 10 microseconds after each write. Afterwards it polls done_flag
 * until it becomes non-zero.
 *
 * Returns 0 on success, 1 on a usage error, an invalid count, or a failed
 * shm_open()/mmap().
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <num_requests>\n", argv[0]);
        return 1;
    }

    const int request_total = atoi(argv[1]);
    if (request_total < 1) {
        fprintf(stderr, "Invalid number of requests.\n");
        return 1;
    }

    // Attach only: no O_CREAT, so the object must already exist.
    const int shm_fd = shm_open(SHM_NAME, O_RDWR, 0666);
    if (shm_fd < 0) {
        perror("shm_open");
        return 1;
    }

    void *const mapping =
        mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shm_fd, 0);
    if (mapping == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    // The descriptor is not needed once the region is mapped.
    close(shm_fd);

    SharedData *const region = mapping;

    // Tell the peer polling ready_flag that requests are about to start.
    region->ready_flag = 1;

    printf("[Client] Sending %d requests...\n", request_total);
    int seq = 0;
    while (seq < request_total) {
        seq++;
        region->request_count = seq;
        usleep(10);  // short pause between consecutive requests
    }

    // Poll every 10 microseconds until done_flag is raised.
    for (;;) {
        if (region->done_flag) {
            break;
        }
        usleep(10);
    }

    printf("[Client] All requests done.\n");

    munmap(mapping, SHM_SIZE);
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>    // For O_* constants
#include <sys/mman.h> // shm_open, mmap
#include <sys/stat.h> // For mode constants
#include <unistd.h>   // ftruncate, close
#include <time.h>

#define SHM_NAME   "/my_shm_example"
#define SHM_SIZE   sizeof(SharedData)

/*
 * Layout of the shared-memory region this server creates.
 * main() zeroes request_count, ready_flag and done_flag, stores the target
 * count in total_requests, then polls ready_flag and request_count and
 * finally sets done_flag.
 */
typedef struct {
    volatile int request_count;   // polled by main(); compared against num_requests
    volatile int total_requests;  // set to num_requests by main(); not read in this program
    volatile int ready_flag;      // polled by main() until non-zero before requests are processed
    volatile int done_flag;       // set to 1 by main() once request_count reaches the target
} SharedData;

/*
 * Reads CLOCK_MONOTONIC and returns it as seconds in a double:
 * the whole seconds plus the nanosecond part divided by 1e9.
 */
double get_time_seconds(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const double whole_seconds = now.tv_sec;
    const double fraction      = now.tv_nsec / 1e9;
    return whole_seconds + fraction;
}

/*
 * Shared-memory benchmark server.
 *
 * Usage: <program> <num_requests>
 *
 * Creates the shared-memory object SHM_NAME, initializes the SharedData
 * region, waits for ready_flag, then polls request_count every 10
 * microseconds. Timing starts when a non-zero request_count is first seen
 * and stops when request_count reaches num_requests. Finally sets done_flag,
 * prints the elapsed time and removes the object.
 *
 * Returns 0 on success, 1 on a usage error, an invalid count, or a failed
 * shm_open()/ftruncate()/mmap().
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <num_requests>\n", argv[0]);
        return 1;
    }
    int num_requests = atoi(argv[1]);
    if (num_requests <= 0) {
        fprintf(stderr, "Invalid number of requests.\n");
        return 1;
    }

    // Remove any object left behind by an earlier run that exited before its
    // shm_unlink(). Reusing it would expose stale flags/counters, and macOS
    // rejects ftruncate() on an object that already has a size (EINVAL).
    // The result is ignored: the usual failure is "no such object".
    (void)shm_unlink(SHM_NAME);

    // Create & initialize shared memory. O_EXCL guarantees a fresh,
    // zero-length object rather than one created concurrently by someone else.
    int fd = shm_open(SHM_NAME, O_CREAT | O_EXCL | O_RDWR, 0666);
    if (fd < 0) {
        perror("shm_open");
        return 1;
    }
    if (ftruncate(fd, SHM_SIZE) < 0) {
        perror("ftruncate");
        close(fd);
        shm_unlink(SHM_NAME); // do not leave a half-initialized object behind
        return 1;
    }

    void *addr = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        close(fd);
        shm_unlink(SHM_NAME);
        return 1;
    }
    close(fd); // the mapping is used from here on; the descriptor is not

    SharedData *shared = (SharedData *)addr;
    shared->request_count = 0;
    shared->total_requests = num_requests;
    shared->ready_flag = 0;
    shared->done_flag = 0;

    printf("[Server] Waiting for client to start...\n");

    // Wait until client signals readiness
    while (!shared->ready_flag) {
        // polling wait with a 10 microsecond sleep between checks
        usleep(10);
    }

    printf("[Server] Client connected. Expecting %d requests.\n", num_requests);

    double t_start = 0;
    double t_end   = 0;
    int first_request_seen = 0;

    // Process requests: only the latest counter value is observed, so
    // intermediate sequence numbers may be skipped between two polls.
    while (1) {
        int current_count = shared->request_count;
        if (current_count > 0 && !first_request_seen) {
            t_start = get_time_seconds();
            first_request_seen = 1;
        }
        if (current_count >= num_requests) {
            t_end = get_time_seconds();
            break;
        }
        usleep(10); // small sleep to reduce CPU usage in busy loop
    }

    // Release the peer that is polling done_flag.
    shared->done_flag = 1;

    double elapsed = t_end - t_start;
    printf("[Server] Received %d requests. Total time: %.6f seconds\n",
            num_requests, elapsed);

    // Cleanup
    munmap(addr, SHM_SIZE);
    shm_unlink(SHM_NAME); // remove shared memory

    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <unistd.h>
#include <string.h>

#define MSG_KEY  0x1234
#define MSG_TYPE 1

/*
 * Message sent over the System V queue.
 * main() fills both fields and passes sizeof(sequence) as the message size
 * to msgsnd(), i.e. the size excludes mtype.
 */
typedef struct {
    long mtype;     // message type; main() sets it to MSG_TYPE
    int  sequence;  // payload: request number, 1..num_requests
} MsgData;

/*
 * Message-queue benchmark client.
 *
 * Usage: <program> <num_requests>
 *
 * Looks up the existing System V queue MSG_KEY and sends num_requests
 * messages of type MSG_TYPE whose payload is the sequence number
 * 1..num_requests, pausing briefly after each send.
 *
 * Returns 0 on success, 1 on a usage error, an invalid count, or a failed
 * msgget()/msgsnd().
 */
int main(int argc, char *argv[]) {
    // Pause after each message, in microseconds. Kept equal to the 10 us
    // pause of the shared-memory client so that both benchmarks pace their
    // requests identically; the previous 1000 us pause made the sleep, not
    // the IPC mechanism, the main contributor to the measured time.
    enum { SEND_DELAY_US = 10 };

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <num_requests>\n", argv[0]);
        return 1;
    }
    int num_requests = atoi(argv[1]);
    if (num_requests <= 0) {
        fprintf(stderr, "Invalid number of requests.\n");
        return 1;
    }

    // No IPC_CREAT: the queue must already exist.
    int msqid = msgget(MSG_KEY, 0666);
    if (msqid == -1) {
        perror("msgget");
        return 1;
    }

    printf("[Client] Sending %d messages...\n", num_requests);

    for (int i = 1; i <= num_requests; i++) {
        MsgData msg;
        msg.mtype = MSG_TYPE;
        msg.sequence = i;
        // The size argument covers the payload only, not mtype.
        if (msgsnd(msqid, &msg, sizeof(msg.sequence), 0) == -1) {
            perror("msgsnd");
            return 1;
        }
        usleep(SEND_DELAY_US);
    }

    printf("[Client] All messages sent.\n");
    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <time.h>
#include <unistd.h>
#include <string.h>

#define MSG_KEY     0x1234
#define MSG_TYPE    1

/*
 * Message received from the System V queue.
 * main() passes sizeof(sequence) as the maximum payload size to msgrcv(),
 * i.e. the size excludes mtype.
 */
typedef struct {
    long mtype;     // message type; main() requests type MSG_TYPE
    int  sequence;  // payload; received but not inspected by main()
} MsgData;

/*
 * Returns the current CLOCK_MONOTONIC time in seconds as a double
 * (tv_sec plus tv_nsec divided by 1e9).
 */
double get_time_seconds(void) {
    struct timespec stamp;
    clock_gettime(CLOCK_MONOTONIC, &stamp);

    double seconds = stamp.tv_nsec / 1e9;  // sub-second part
    seconds = stamp.tv_sec + seconds;      // add whole seconds
    return seconds;
}

/*
 * Message-queue benchmark server.
 *
 * Usage: <program> <num_requests>
 *
 * Creates (or opens) the System V queue MSG_KEY and receives num_requests
 * messages of type MSG_TYPE. Timing starts right after the first message
 * has been received and stops after the last one. The queue is removed
 * before exiting, including when msgrcv() fails.
 *
 * Returns 0 on success, 1 on a usage error, an invalid count, or a failed
 * msgget()/msgrcv()/msgctl().
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        fprintf(stderr, "Usage: %s <num_requests>\n", argv[0]);
        return 1;
    }
    int num_requests = atoi(argv[1]);
    if (num_requests <= 0) {
        fprintf(stderr, "Invalid number of requests.\n");
        return 1;
    }

    // Create message queue
    int msqid = msgget(MSG_KEY, IPC_CREAT | 0666);
    if (msqid == -1) {
        perror("msgget");
        return 1;
    }

    printf("[Server] Waiting for %d messages...\n", num_requests);

    double t_start = 0;
    double t_end   = 0;
    int first_msg_received = 0;
    int count = 0;

    while (count < num_requests) {
        MsgData msg;
        // Blocks until a message of type MSG_TYPE is available.
        if (msgrcv(msqid, &msg, sizeof(msg.sequence), MSG_TYPE, 0) == -1) {
            perror("msgrcv");
            // A System V queue persists after the process exits; remove it
            // so a failed run does not leave the queue and any pending
            // messages behind. The result is ignored: we are already failing.
            msgctl(msqid, IPC_RMID, NULL);
            return 1;
        }

        // The clock starts after the first receive completes, so the wait
        // for the first message is not part of the measurement.
        if (!first_msg_received) {
            t_start = get_time_seconds();
            first_msg_received = 1;
        }

        count++;
    }

    t_end = get_time_seconds();
    double elapsed = t_end - t_start;
    printf("[Server] Received %d messages. Total time: %.6f seconds\n", num_requests, elapsed);

    // Cleanup
    if (msgctl(msqid, IPC_RMID, NULL) == -1) {
        perror("msgctl");
        return 1;
    }

    return 0;
}
# Builds the four benchmark programs, one executable per source file.
# Recipe lines must start with a tab character; space-indented recipes make
# `make` stop with "missing separator".
CC = gcc
CFLAGS = -Wall -O2

# all and clean name actions, not files.
.PHONY: all clean

all: shared_mem_server shared_mem_client ipc_server ipc_client

shared_mem_server: shared_mem_server.c
	$(CC) $(CFLAGS) -o shared_mem_server shared_mem_server.c

shared_mem_client: shared_mem_client.c
	$(CC) $(CFLAGS) -o shared_mem_client shared_mem_client.c

ipc_server: ipc_server.c
	$(CC) $(CFLAGS) -o ipc_server ipc_server.c

ipc_client: ipc_client.c
	$(CC) $(CFLAGS) -o ipc_client ipc_client.c

clean:
	rm -f shared_mem_server shared_mem_client ipc_server ipc_client

테스트 실행:

# Build
make

# IPC server (background; exits after receiving every message)
./ipc_server 10000 &
sleep 1   # the client looks the queue up without creating it, so let the server create it first

# IPC client
./ipc_client 10000
wait      # let the server print its timing and remove the queue

# Shared memory server (background; exits after the last request)
./shared_mem_server 10000 &
sleep 1   # the client opens the segment without creating it, so let the server set it up first

# Shared memory client
./shared_mem_client 10000
wait      # let the server print its timing and unlink the segment

결과:

image.png

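단적인 케이스이지만, shared memory 사용 시 System V message queue보다 약 85배 빠른 결과가 나왔습니다. 다만 이 수치는 IPC client가 메시지마다 usleep(1000), shared memory client가 요청마다 usleep(10)으로 대기하는 조건에서 측정한 값이므로, syscall 오버헤드뿐 아니라 대기 시간의 차이도 포함되어 있습니다. syscall 비용만 비교하려면 두 client의 대기 시간을 동일하게 맞춰야 합니다.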

Summary

OS API 호출은 모드 스위치와 기타 비용 때문에 상대적으로 무겁습니다. 호출 횟수를 줄이거나 일괄 처리, 비동기 I/O 등을 활용하면 성능을 높일 수 있습니다. Python처럼 고수준 언어에서도 제대로 프로파일링하여 syscall 오버헤드를 줄이는 방법을 익혀둬야 합니다.