xv6 Lab 已经做完几天了,做完后没来得及写笔记,现在来补写一下最后两个 Lab 的笔记,可能部分代码的思考心路历程和 Debug 历程有所遗忘了 QAQ,重点可能就写一下自己的代码思路吧。
Lab9
Large files
Xv6 目前的 inode 仅支持 12 个直接索引块和 1 个一级间接索引块,由于 BSIZE
是 1KB,每个 block number 是 4 字节,1KB 里可以存放 $1024\div 4 = 256$ 个 block number,因此一个文件最多只能 refer 到 $12 + 1 \times 256 = 268$ 个 block,即文件大小最大为 268 KB。
这个 task 需要你实现一个二级索引,一个二级索引可以索引 $256 \times 256 = 65536$ 个 block,占用 1 个直接索引来实现一个二级索引,这样可以让 Xv6 支持的文件大小变为 $256\times 256+256+11 = 65803$ KB。
实现难度不大,在 bmap
里模仿它的一级索引的代码实现即可,
// Inode content
//
// The content (data) associated with each inode is stored
// in blocks on the disk. The first NDIRECT block numbers
// are listed in ip->addrs[]. The next NINDIRECT blocks are
// listed in block ip->addrs[NDIRECT].
// Return the disk block address of the nth block in inode ip.
// If there is no such block, bmap allocates one.
// returns 0 if out of disk space.
static uint
bmap(struct inode *ip, uint bn)
{
uint addr, *a;
struct buf *bp;
uint n1, n2;
if(bn < NDIRECT){
if((addr = ip->addrs[bn]) == 0){
addr = balloc(ip->dev);
if(addr == 0)
return 0;
ip->addrs[bn] = addr;
}
return addr;
}
bn -= NDIRECT;
if(bn < NINDIRECT){
// Load indirect block, allocating if necessary.
if((addr = ip->addrs[NDIRECT]) == 0){
addr = balloc(ip->dev);
if(addr == 0)
return 0;
ip->addrs[NDIRECT] = addr;
}
bp = bread(ip->dev, addr);
a = (uint*)bp->data;
if((addr = a[bn]) == 0){
addr = balloc(ip->dev);
if(addr){
a[bn] = addr;
log_write(bp);
}
}
brelse(bp);
return addr;
}
bn -= NINDIRECT;
if (bn < NINDIRECT*NINDIRECT) {
n1 = bn / NINDIRECT;
n2 = bn % NINDIRECT;
// Load double indirect block, allocating if necessary.
if((addr = ip->addrs[NDIRECT + 1]) == 0){
addr = balloc(ip->dev);
if(addr == 0)
return 0;
ip->addrs[NDIRECT + 1] = addr;
}
bp = bread(ip->dev, addr);
a = (uint*)bp->data;
if ((addr = a[n1]) == 0) {
addr = balloc(ip->dev);
if (addr) {
a[n1] = addr;
log_write(bp);
}
}
brelse(bp);
if (addr == 0)
return 0;
bp = bread(ip->dev, addr);
a = (uint*)bp->data;
if ((addr = a[n2]) == 0) {
addr = balloc(ip->dev);
if (addr) {
a[n2] = addr;
log_write(bp);
}
}
brelse(bp);
return addr;
}
panic("bmap: out of range");
}
然后再在 itrunc
里模仿它的实现添加代码,
// Truncate inode (discard contents).
// Caller must hold ip->lock.
void
itrunc(struct inode *ip)
{
int i, j;
struct buf *bp, *bp2;
uint *a, *a2;
for(i = 0; i < NDIRECT; i++){
if(ip->addrs[i]){
bfree(ip->dev, ip->addrs[i]);
ip->addrs[i] = 0;
}
}
if(ip->addrs[NDIRECT]){
bp = bread(ip->dev, ip->addrs[NDIRECT]);
a = (uint*)bp->data;
for(j = 0; j < NINDIRECT; j++){
if(a[j])
bfree(ip->dev, a[j]);
}
brelse(bp);
bfree(ip->dev, ip->addrs[NDIRECT]);
ip->addrs[NDIRECT] = 0;
}
if (ip->addrs[NDIRECT + 1]) {
bp = bread(ip->dev, ip->addrs[NDIRECT + 1]);
a = (uint*)bp->data;
for (i = 0; i < NINDIRECT; ++i) {
if (a[i]) {
bp2 = bread(ip->dev, a[i]);
a2 = (uint*)bp2->data;
for (j = 0; j < NINDIRECT; ++j) {
if (a2[j])
bfree(ip->dev, a2[j]);
}
brelse(bp2);
bfree(ip->dev, a[i]);
a[i] = 0;
}
}
brelse(bp);
bfree(ip->dev, ip->addrs[NDIRECT + 1]);
ip->addrs[NDIRECT + 1] = 0;
}
ip->size = 0;
iupdate(ip);
}
最后你还需要修改 fs.h
的相关定义,
#define NDIRECT 11
#define NINDIRECT (BSIZE / sizeof(uint))
#define MAXFILE (NDIRECT + NINDIRECT + NINDIRECT*NINDIRECT)
// On-disk inode structure
struct dinode {
short type; // File type
short major; // Major device number (T_DEVICE only)
short minor; // Minor device number (T_DEVICE only)
short nlink; // Number of links to inode in file system
uint size; // Size of file (bytes)
uint addrs[NDIRECT+2]; // Data block addresses
};
Symbolic links
这个 task 的任务是实现一个软链接,即 Linux 中的 ln -s
,软链接我们平时都还经常使用的。
软链接类似于 Windows 的快捷方式,只是提供一个目标文件的路径信息,并且在创建软链接的时候即使路径信息指向的文件不存在也可以成功。
实现思路不难,利用 create
创建一个新的 inode
,这个 inode
的 data
就填入路径字符串。 创建软链接的代码实现如下:
uint64
sys_symlink(void) {
char path[MAXPATH], target[MAXPATH];
struct inode *ip;
int len;
if((len = argstr(0, target, MAXPATH)) < 0 || argstr(1, path, MAXPATH) < 0)
return -1;
begin_op();
if((ip = create(path, T_SYMLINK, 0, 0)) == 0){
end_op();
return -1;
}
if(writei(ip, 0, (uint64)target, 0, len) != len) {
iunlockput(ip);
end_op();
return -1;
}
iunlockput(ip);
end_op();
return 0;
}
接下来需要修改 sys_open
来实现路径查询。在拿到 inode 后,我们要做的第一件事就是应该判断这是不是一个 T_SYMLINK
类型(即是不是软链接),如果是软链接,我们需要通过软链接得到真实目标文件的 inode。软链接可能是递归嵌套的,这里我使用一个循环来查找软链接指向的目标文件,当循环次数超过 10(嵌套深度超过 10),我们认为这是一个不合法的“软链接环”。如果不是软链接,接下来再依次处理文件类型是 T_DIR
、T_DEVICE
的情况。namei
可以根据路径字符串找到我们需要的 inode
。
sysopen
的实现代码如下:
uint64
sys_open(void)
{
char path[MAXPATH];
int fd, omode;
struct file *f;
struct inode *ip;
int n, depth, ok;
argint(1, &omode);
if((n = argstr(0, path, MAXPATH)) < 0)
return -1;
begin_op();
if(omode & O_CREATE){
ip = create(path, T_FILE, 0, 0);
if(ip == 0){
end_op();
return -1;
}
} else {
if((ip = namei(path)) == 0){
end_op();
return -1;
}
ilock(ip);
}
if (ip->type == T_SYMLINK && !(omode & O_NOFOLLOW)) {
ok = 0;
for (depth = 0; depth < 10; ++depth) {
if (readi(ip, 0, (uint64)path, 0, min(ip->size, MAXPATH)) != ip->size) {
iunlockput(ip);
end_op();
return -1;
}
path[ip->size] = '\0';
iunlockput(ip);
if ((ip = namei(path)) == 0) {
end_op();
return -1;
}
ilock(ip);
if (ip->type != T_SYMLINK) {
ok = 1;
break;
}
}
if (!ok) {
iunlockput(ip);
end_op();
return -1;
}
}
if(ip->type == T_DIR && omode != O_RDONLY){
iunlockput(ip);
end_op();
return -1;
}
if(ip->type == T_DEVICE && (ip->major < 0 || ip->major >= NDEV)){
iunlockput(ip);
end_op();
return -1;
}
if((f = filealloc()) == 0 || (fd = fdalloc(f)) < 0){
if(f)
fileclose(f);
iunlockput(ip);
end_op();
return -1;
}
if(ip->type == T_DEVICE){
f->type = FD_DEVICE;
f->major = ip->major;
} else {
f->type = FD_INODE;
f->off = 0;
}
f->ip = ip;
f->readable = !(omode & O_WRONLY);
f->writable = (omode & O_WRONLY) || (omode & O_RDWR);
if((omode & O_TRUNC) && ip->type == T_FILE){
itrunc(ip);
}
iunlock(ip);
end_op();
return fd;
}
测试
完整代码
整个 Lab 的完整代码如下:
diff --git a/Makefile b/Makefile
index 8e74f52..3c04216 100644
--- a/Makefile
+++ b/Makefile
@@ -188,6 +188,7 @@ UPROGS=\
$U/_grind\
$U/_wc\
$U/_zombie\
+ $U/_symlinktest\
diff --git a/kernel/fcntl.h b/kernel/fcntl.h
index 44861b9..9041d12 100644
--- a/kernel/fcntl.h
+++ b/kernel/fcntl.h
@@ -3,3 +3,4 @@
#define O_RDWR 0x002
#define O_CREATE 0x200
#define O_TRUNC 0x400
+#define O_NOFOLLOW 0x010
diff --git a/kernel/file.h b/kernel/file.h
index b076d1d..5c4eb3a 100644
--- a/kernel/file.h
+++ b/kernel/file.h
@@ -26,7 +26,7 @@ struct inode {
short minor;
short nlink;
uint size;
- uint addrs[NDIRECT+1];
+ uint addrs[NDIRECT+2];
};
// map major device number to device functions.
diff --git a/kernel/fs.c b/kernel/fs.c
index c6bab15..a36c4c1 100644
--- a/kernel/fs.c
+++ b/kernel/fs.c
@@ -384,6 +384,7 @@ bmap(struct inode *ip, uint bn)
{
uint addr, *a;
struct buf *bp;
+ uint n1, n2;
if(bn < NDIRECT){
if((addr = ip->addrs[bn]) == 0){
@@ -416,6 +417,42 @@ bmap(struct inode *ip, uint bn)
brelse(bp);
return addr;
}
+ bn -= NINDIRECT;
+
+ if (bn < NINDIRECT*NINDIRECT) {
+ n1 = bn / NINDIRECT;
+ n2 = bn % NINDIRECT;
+ // Load double indirect block, allocating if necessary.
+ if((addr = ip->addrs[NDIRECT + 1]) == 0){
+ addr = balloc(ip->dev);
+ if(addr == 0)
+ return 0;
+ ip->addrs[NDIRECT + 1] = addr;
+ }
+ bp = bread(ip->dev, addr);
+ a = (uint*)bp->data;
+ if ((addr = a[n1]) == 0) {
+ addr = balloc(ip->dev);
+ if (addr) {
+ a[n1] = addr;
+ log_write(bp);
+ }
+ }
+ brelse(bp);
+ if (addr == 0)
+ return 0;
+ bp = bread(ip->dev, addr);
+ a = (uint*)bp->data;
+ if ((addr = a[n2]) == 0) {
+ addr = balloc(ip->dev);
+ if (addr) {
+ a[n2] = addr;
+ log_write(bp);
+ }
+ }
+ brelse(bp);
+ return addr;
+ }
panic("bmap: out of range");
}
@@ -426,8 +463,8 @@ void
itrunc(struct inode *ip)
{
int i, j;
- struct buf *bp;
- uint *a;
+ struct buf *bp, *bp2;
+ uint *a, *a2;
for(i = 0; i < NDIRECT; i++){
if(ip->addrs[i]){
@@ -448,6 +485,27 @@ itrunc(struct inode *ip)
ip->addrs[NDIRECT] = 0;
}
+ if (ip->addrs[NDIRECT + 1]) {
+ bp = bread(ip->dev, ip->addrs[NDIRECT + 1]);
+ a = (uint*)bp->data;
+ for (i = 0; i < NINDIRECT; ++i) {
+ if (a[i]) {
+ bp2 = bread(ip->dev, a[i]);
+ a2 = (uint*)bp2->data;
+ for (j = 0; j < NINDIRECT; ++j) {
+ if (a2[j])
+ bfree(ip->dev, a2[j]);
+ }
+ brelse(bp2);
+ bfree(ip->dev, a[i]);
+ a[i] = 0;
+ }
+ }
+ brelse(bp);
+ bfree(ip->dev, ip->addrs[NDIRECT + 1]);
+ ip->addrs[NDIRECT + 1] = 0;
+ }
+
ip->size = 0;
iupdate(ip);
}
diff --git a/kernel/fs.h b/kernel/fs.h
index 139dcc9..3a49916 100644
--- a/kernel/fs.h
+++ b/kernel/fs.h
@@ -24,9 +24,9 @@ struct superblock {
#define FSMAGIC 0x10203040
-#define NDIRECT 12
+#define NDIRECT 11
#define NINDIRECT (BSIZE / sizeof(uint))
-#define MAXFILE (NDIRECT + NINDIRECT)
+#define MAXFILE (NDIRECT + NINDIRECT + NINDIRECT*NINDIRECT)
// On-disk inode structure
struct dinode {
@@ -35,7 +35,7 @@ struct dinode {
short minor; // Minor device number (T_DEVICE only)
short nlink; // Number of links to inode in file system
uint size; // Size of file (bytes)
- uint addrs[NDIRECT+1]; // Data block addresses
+ uint addrs[NDIRECT+2]; // Data block addresses
};
// Inodes per block.
diff --git a/kernel/stat.h b/kernel/stat.h
index 19543af..9554d30 100644
--- a/kernel/stat.h
+++ b/kernel/stat.h
@@ -1,6 +1,7 @@
#define T_DIR 1 // Directory
#define T_FILE 2 // File
#define T_DEVICE 3 // Device
+#define T_SYMLINK 4 // Symbolic link
struct stat {
int dev; // File system's disk device
diff --git a/kernel/syscall.c b/kernel/syscall.c
index ed65409..52233e8 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -101,6 +101,7 @@ extern uint64 sys_unlink(void);
extern uint64 sys_link(void);
extern uint64 sys_mkdir(void);
extern uint64 sys_close(void);
+extern uint64 sys_symlink(void);
// An array mapping syscall numbers from syscall.h
// to the function that handles the system call.
@@ -126,6 +127,7 @@ static uint64 (*syscalls[])(void) = {
[SYS_link] sys_link,
[SYS_mkdir] sys_mkdir,
[SYS_close] sys_close,
+[SYS_symlink] sys_symlink,
};
void
diff --git a/kernel/syscall.h b/kernel/syscall.h
index bc5f356..13818da 100644
--- a/kernel/syscall.h
+++ b/kernel/syscall.h
@@ -20,3 +20,4 @@
#define SYS_link 19
#define SYS_mkdir 20
#define SYS_close 21
+#define SYS_symlink 22
diff --git a/kernel/sysfile.c b/kernel/sysfile.c
index 16b668c..e24d259 100644
--- a/kernel/sysfile.c
+++ b/kernel/sysfile.c
@@ -16,6 +16,8 @@
#include "file.h"
#include "fcntl.h"
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
// Fetch the nth word-sized system call argument as a file descriptor
// and return both the descriptor and the corresponding struct file.
static int
@@ -308,7 +310,7 @@ sys_open(void)
int fd, omode;
struct file *f;
struct inode *ip;
- int n;
+ int n, depth, ok;
argint(1, &omode);
if((n = argstr(0, path, MAXPATH)) < 0)
@@ -328,13 +330,41 @@ sys_open(void)
return -1;
}
ilock(ip);
- if(ip->type == T_DIR && omode != O_RDONLY){
+ }
+
+ if (ip->type == T_SYMLINK && !(omode & O_NOFOLLOW)) {
+ ok = 0;
+ for (depth = 0; depth < 10; ++depth) {
+ if (readi(ip, 0, (uint64)path, 0, min(ip->size, MAXPATH)) != ip->size) {
+ iunlockput(ip);
+ end_op();
+ return -1;
+ }
+ path[ip->size] = '\0';
+ iunlockput(ip);
+ if ((ip = namei(path)) == 0) {
+ end_op();
+ return -1;
+ }
+ ilock(ip);
+ if (ip->type != T_SYMLINK) {
+ ok = 1;
+ break;
+ }
+ }
+ if (!ok) {
iunlockput(ip);
end_op();
return -1;
}
}
+ if(ip->type == T_DIR && omode != O_RDONLY){
+ iunlockput(ip);
+ end_op();
+ return -1;
+ }
+
if(ip->type == T_DEVICE && (ip->major < 0 || ip->major >= NDEV)){
iunlockput(ip);
end_op();
@@ -503,3 +533,29 @@ sys_pipe(void)
}
return 0;
}
+
+uint64
+sys_symlink(void) {
+ char path[MAXPATH], target[MAXPATH];
+ struct inode *ip;
+ int len;
+
+ if((len = argstr(0, target, MAXPATH)) < 0 || argstr(1, path, MAXPATH) < 0)
+ return -1;
+
+ begin_op();
+ if((ip = create(path, T_SYMLINK, 0, 0)) == 0){
+ end_op();
+ return -1;
+ }
+
+ if(writei(ip, 0, (uint64)target, 0, len) != len) {
+ iunlockput(ip);
+ end_op();
+ return -1;
+ }
+ iunlockput(ip);
+ end_op();
+
+ return 0;
+}
\ No newline at end of file
diff --git a/user/user.h b/user/user.h
index 4d398d5..a78cb6d 100644
--- a/user/user.h
+++ b/user/user.h
@@ -22,6 +22,7 @@ int getpid(void);
char* sbrk(int);
int sleep(int);
int uptime(void);
+int symlink(const char*, const char*);
// ulib.c
int stat(const char*, struct stat*);
diff --git a/user/usys.pl b/user/usys.pl
index 01e426e..bc5c22e 100755
--- a/user/usys.pl
+++ b/user/usys.pl
@@ -36,3 +36,4 @@ entry("getpid");
entry("sbrk");
entry("sleep");
entry("uptime");
+entry("symlink");
Xv6 Book Chapter 8
Xv6 金句摘录,以下仅记录 Xv6 book 中笔者觉得 make sense 或者有用或者自己不太熟悉的部分:
The logging layer allows higher layers to wrap updates to several blocks in a transaction, and ensures that the blocks are updated atomically in the face of crashes (i.e., all of them are updated or none).
The file descriptor layer abstracts many Unix resources (e.g., pipes, devices, files, etc.) using the file system interface, simplifying the lives of application programmers.
Disk hardware traditionally presents the data on the disk as a numbered sequence of 512-byte blocks (also called sectors): sector 0 is the first 512 bytes, sector 1 is the next, and so on.
The block size that an operating system uses for its file system maybe different than the sector size that a disk uses, but typically the block size is a multiple of the sector size. (Xv6 中 block size = 2 * sector size)
The superblock is filled in by a separate program, called mkfs, which builds an initial file system.
The buffer cache recycles the least recently used buffer for the new block.
The field disk
indicates that the buffer content has been handed to the disk, which may change the buffer (e.g., write data from the disk into data).
Note that the assignment b->valid = 0
ensures that bread will read the block data from disk rather than incorrectly using the buffer’s previous contents.
The sleep-lock protects reads and writes of the block’s buffered content, while the bcache.lock
protects information about which blocks are cached.
The name brelse, a shortening of b-release, is cryptic but worth learning: it originated in Unix and is used in BSD, Linux, and Solaris too.
Xv6 solves the problem of crashes during file-system operations with a simple form of logging. An xv6 system call does not directly write the on-disk file system data structures. Instead, it places a description of all the disk writes it wishes to make in a log on the disk. Once the system call has logged all of its writes, it writes a special commit record to the disk indicating that the log contains a complete operation.
After recovery, either all of the operation’s writes appear on the disk, or none of them appear.
The idea of committing several transactions together is known as group commit. Group commit reduces the number of disk operations because it amortizes the fixed cost of a commit over multiple operations.
Xv6’s virtio driver doesn’t support this kind of batching, but xv6’s file system design allows for it.
Xv6 dedicates a fixed amount of space on the disk to hold the log. The total number of blocks written by the system calls in a transaction must fit in that space.
Xv6’s write system call breaks up large writes into multiple smaller writes that fit in the log, and unlink doesn’t cause problems because in practice the xv6 file system uses only one bitmap block.
The term inode can have one of two related meanings. It might refer to the on-disk data structure containing a file’s size and list of data block numbers. Or “inode” might refer to an in-memory inode, which contains a copy of the on-disk inode as well as extra information needed within the kernel.
iput()
can write to the disk. This, in turn, means that even read-only system calls must be wrapped in transactions if they use the file system.
For example, next
points to the same inode as ip when looking up “.”. Locking next before releasing the lock on ip
would result in a deadlock. To avoid this deadlock, namex
unlocks the directory before obtaining a lock on next. Here again we see why the separation between iget
and ilock
is important. (Note:inode 和 block 在 read (即只读)之前都需要上锁)
Xv6’s logging system is inefficient. A commit cannot occur concurrently with file-system system calls.
Xv6 uses the same basic on-disk layout of inodes and directories as early UNIX; this scheme has been remarkably persistent over the years. BSD’s UFS/FFS and Linux’s ext2/ext3 use essentially the same data structures.
Microsoft Windows’s NTFS, macOS’s HFS, and Solaris’s ZFS, just to name a few, implement a directory as an on-disk balanced tree of blocks. This is complicated but guarantees logarithmic-time directory lookups.