Open-Source Internship opportunity by OpenGenus for programmers. Apply now.
In this article, we have used different system calls (syscalls) in order to reverse a large file (>1GB) using a chunk size rather than character by character. We have demonstrated this with a C code.
SYSTEM CALLS USED
- open(): Open the file at path
Path
with the flagsflags
and permissionsmode
and returns the file descriptor that points to that file in the file table entry. In case of any error, it returns -1.
int open (const char* Path, int flags [, int mode ]);
- close() Close the file. It returns 0 once the file is closed successfully and -1 in case of any error.
int close (int filedes);
- lseek(): repositions the file offset of the file descriptor
fd
tooffset
bytes with respect towhence
directive(SEEK_SET
,SEEK_CUR
,SEEK_END
) and returns the new location of the file offset of the file descriptor
off_t lseek(int fd, off_t offset, int whence);
- read(): Read data upto
count
bytes from a file whose file descriptor isfd
and store the results inbuf
pointer. It returns the number of bytes read. In case of any error, it returns -1.
ssize_t read(int fd, void *buf, size_t count);
- write(): Read data upto
count
bytes from the memory location pointed bybuf
to a file whose file descriptor isfd
. It returns the number of bytes read. In case of any error, it returns -1.
ssize_t write(int fd, const void *buf, size_t count);
- fflush(): Clear (or flush) the output buffer and move the buffered data to the output stream.
fflush(FILE *ostream);
EXPLANATION OF THE CODE
STEP 1: READING THE FILES
Here, the input file is "input.txt" and the output file is "output.txt". The input file contains the data to be reversed, while the output file will contain the reversed data. In case of any errors, then an error message is displayed.
int source_file = open("input.txt", O_RDONLY);
off_t fileLength = lseek(source_file, 0, SEEK_END);
int dest_file = open("output.txt", O_CREAT | O_RDWR | O_TRUNC, 0600);
if (source_file < 0 || dest_file < 0 || fileLength <= 0)
{
write(2, "Error in file operations", 25);
exit(1);
}
STEP 2: REPOSITION THE FILE DESCRIPTOR OFFSETS
Here, we would reposition the file descriptor offsets as follows:
- move the destination file offset to beginning of file
- move the source file offset to the end of file
lseek(dest_file, 0, SEEK_SET);
lseek(source_file, -1, SEEK_END);
STEP 3: SET THE CHUNK SIZE
Here, we would set the chunk size as the power of 2 that is closest to 0.2% of size of the whole file. This chunk size is stored in no_of_bytes_per_read
.
int pref_byte_size = fileLength / 500;
int no_of_bytes_per_read = (pref_byte_size == 0) ? 1 : pow(2, (int)(log((double)(pref_byte_size)) / log(2.0)));
STEP 4: DEFINE THE CHARACTER POINTERS TO STORE THE READ CHUNK AND THE REVERSED CHUNK
Here, c
denotes the chunk of bytes that would be read and r
denotes the reverse of c
.
char *c, *r;
c = (char *)malloc(no_of_bytes_per_read);
r = (char *)malloc(no_of_bytes_per_read);
STEP 5: SET START LOCATION AND THE INITIAL LENGTH TO MOVE
Here, the start
location is set as the starting location of the last chunk.
For example, if the source_file
contains 0123456789
and the chunk size is 3, then the different chunks would be 012 345 678 9
. Here, start would point to the index of 9.
Also, the length_to_move
is set as the size of the chunk that would be read. (in the example above, 1)
int start = ((fileLength - 1) / no_of_bytes_per_read) * no_of_bytes_per_read;
int length_to_move = fileLength - start;
lseek(source_file, start, SEEK_SET);
STEP 6: THE MAIN LOOP
while (1)
{
// STEP 6.1: Read `length_to_move` bytes from `source_file` and store in `c`
printf("\nLocation: %lld\tLength read: %d\t", lseek(source_file, 0, SEEK_CUR), length_to_move);
read(source_file, c, length_to_move);
printf("Read %s\t", c);
// STEP 6.2: Reverse the contents of `c` and store in `r`
for (int begin = 0, end = length_to_move - 1; begin < length_to_move; begin++, end--)
r[begin] = c[end];
r[length_to_move] = '\0';
// STEP 6.3: Write `r` to the `dest_file`
printf("Write: %s\t", r);
fflush(stdout);
write(dest_file, r, length_to_move);
// STEP 6.4: Check if there are any more chunks to read
if (lseek(source_file, 0, SEEK_CUR) - no_of_bytes_per_read - length_to_move < 0)
break;
// STEP 6.5: Move the `source_file` to the start of the previous chunk
lseek(source_file, -no_of_bytes_per_read - length_to_move, SEEK_CUR);
length_to_move = no_of_bytes_per_read;
}
Here the different steps are as follows:
STEP 6.1: Read length_to_move
bytes from source_file
and store in c
read(source_file, c, length_to_move);
STEP 6.2: Reverse the contents of c
and store in r
for (int begin = 0, end = length_to_move - 1; begin < length_to_move; begin++, end--)
r[begin] = c[end];
r[length_to_move] = '\0';
STEP 6.3: Write r
to the dest_file
write(dest_file, r, length_to_move);
STEP 6.4: Check if there are any more chunks to read, if not, then exit the loop
if (lseek(source_file, 0, SEEK_CUR) - no_of_bytes_per_read - length_to_move < 0)
break;
STEP 6.5: Move the source_file
offset to the start of the previous chunk
lseek(source_file, -no_of_bytes_per_read - length_to_move, SEEK_CUR);
length_to_move = no_of_bytes_per_read;
STEP 7: CLOSING THE FILES
After the copying has been done, we would close all the files that have been used.
close(source_file);
close(dest_file);
Complete code
Following is the complete code which we demonstrated:
// Part of OpenGenus IQ
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>
#include <math.h>
#include <string.h>
#include <sys/stat.h>
int main(int argc, char *argv[])
{
// STEP 1: READING THE FILES
int source_file = open("input.txt", O_RDONLY);
off_t fileLength = lseek(source_file, 0, SEEK_END);
int dest_file = open("output.txt", O_CREAT | O_RDWR | O_TRUNC, 0600);
if (source_file < 0 || dest_file < 0 || fileLength <= 0)
{
write(1, "Error in file operations", 25);
exit(1);
}
// STEP 2: REPOSITION THE FILE DESCRIPTOR OFFSETS
lseek(dest_file, 0, SEEK_SET);
lseek(source_file, -1, SEEK_END);
// STEP 3: SET THE CHUNK SIZE
int pref_byte_size = fileLength / 500;
int no_of_bytes_per_read = (pref_byte_size == 0) ? 1 : pow(2, (int)(log((double)(pref_byte_size)) / log(2.0)));
// STEP 4: DEFINE THE CHARACTER POINTERS TO STORE THE READ CHUNK AND THE REVERSED CHUNK
char *c, *r;
c = (char *)malloc(no_of_bytes_per_read);
r = (char *)malloc(no_of_bytes_per_read);
// STEP 5: SET START LOCATION AND THE INITIAL LENGTH TO MOVE
int start = ((fileLength - 1) / no_of_bytes_per_read) * no_of_bytes_per_read;
int length_to_move = fileLength - start;
lseek(source_file, start, SEEK_SET);
printf("\nFileLength: %lld \nNo of bytes per read: %d \nStart: %d \nChunk Size: %d\n\n", fileLength, no_of_bytes_per_read, start, length_to_move);
// STEP 6: THE MAIN LOOP
while (1)
{
// STEP 6.1: Read `length_to_move` bytes from `source_file` and store in `c`
printf("\nLocation: %lld\tLength read: %d\t", lseek(source_file, 0, SEEK_CUR), length_to_move);
read(source_file, c, length_to_move);
printf("Read %s\t", c);
// STEP 6.2: Reverse the contents of `c` and store in `r`
for (int begin = 0, end = length_to_move - 1; begin < length_to_move; begin++, end--)
r[begin] = c[end];
r[length_to_move] = '\0';
// STEP 6.3: Write `r` to the `dest_file`
printf("Write: %s\t", r);
fflush(stdout);
write(dest_file, r, length_to_move);
// STEP 6.4: Check if there are any more chunks to read
if (lseek(source_file, 0, SEEK_CUR) - no_of_bytes_per_read - length_to_move < 0)
break;
// STEP 6.5: Move the `source_file` to the start of the previous chunk
lseek(source_file, -no_of_bytes_per_read - length_to_move, SEEK_CUR);
length_to_move = no_of_bytes_per_read;
}
write(1, "\nDone\n", 5);
// STEP 7: CLOSING THE FILES
close(source_file);
close(dest_file);
return 0;
}
OBSERVATIONS
We see that this method of dividing the input data into chunks and then reversing and writing to the output file, is much faster than reading it character by character, because of the time overhead associated with file I/O operations. Also, the memory is not large enough to store the entire file read, at once.
With this article at OpenGenus, you must have a complete idea of reversing a large file using IO Syscalls (using chunks). Enjoy.