From e9a3585174e0fda1ac580d0e720f7ecc48bb9e9f Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Fri, 10 Jan 2020 11:38:48 +0100 Subject: [PATCH] Initial commit. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit These files were created 2015 and orginally contained only the material that was used for the Unix courses held back then at the MPI for development biology in Tübingen. Over time, more content was added and existing contents have been improved. Development was tracked within the non-public "user-info" git repository that also contains unrelated contents for internal use within the institute. In 2019-12 the title was changed to "Advanced Problems in the Linux Environment" and the pages were made public. To give interested readers access to the source code, contents were moved to a dedicated repository that was made public in 2020-01. No attempts were made to filter out the 269 commits of the user-info repo that touched the files of the unix course. --- Bash.m4 | 867 +++++++++++++ Command_Line_Utilities.m4 | 571 ++++++++ Debugging.m4 | 205 +++ Filesystems.m4 | 1471 +++++++++++++++++++++ Git.m4 | 1092 ++++++++++++++++ Gridengine.m4 | 740 +++++++++++ Introduction.m4 | 251 ++++ LVM.m4 | 919 +++++++++++++ Makefile | 64 + Networking.m4 | 629 +++++++++ OS-Level_Virtualization.m4 | 823 ++++++++++++ Unix_Concepts.m4 | 2527 ++++++++++++++++++++++++++++++++++++ include/css/aple.css | 96 ++ include/imgs/aple.svg | 58 + include/m4/aple.m4 | 107 ++ 15 files changed, 10420 insertions(+) create mode 100644 Bash.m4 create mode 100644 Command_Line_Utilities.m4 create mode 100644 Debugging.m4 create mode 100644 Filesystems.m4 create mode 100644 Git.m4 create mode 100644 Gridengine.m4 create mode 100644 Introduction.m4 create mode 100644 LVM.m4 create mode 100644 Makefile create mode 100644 Networking.m4 create mode 100644 OS-Level_Virtualization.m4 create mode 100644 Unix_Concepts.m4 create mode 100644 include/css/aple.css create mode 100644 include/imgs/aple.svg create mode 100644 include/m4/aple.m4 diff --git a/Bash.m4 b/Bash.m4 new file mode 100644 index 0000000..3c1d898 --- /dev/null +++ b/Bash.m4 @@ -0,0 +1,867 @@ +TITLE(« + + Write programs that do one thing and do it well. Write programs + to work together. Write programs to handle text streams, + because that is a universal interface. -- Doug MacIlroy + +», __file__) + +SECTION(«CMD(«sh») versus CMD(«bash»)») + +- Bash scripts begin with a sha-bang: CMD(«#!/bin/bash») +- sh is the POSIX defined shell specification. +- Bash is one implementation of the sh specification. +- /bin/sh links to the default shell of your system. +- This can be different from your user shell! +- Each shell has its idiosyncracies. +- Using a sha-bang pointing to bash is safer CMD(«#!/bin/bash») +- < 10.3, the default Mac OS X shell is tcsh (bad!) +- Scripts need to be executable (chmod u+x script). + +EXERCISES() +- Your current shell is stored the CMD($SHELL) environment variable. +Use echo to find out what it is. +- Similarly, find out your CMD($BASH_VERSION). +- Use readlink to find out what CMD(/bin/sh) points to in your system. + +HOMEWORK(« +Explain why bash does not exit when you type CMD(«Ctrl+C») on the +command line. +») + +SECTION(«Variables») +- Variables are defined using CMD(«=») (no spaces!). +- Variables are read using CMD(«$»). +- Spaces are the enemy of variables. Spaces break variables. +- Double quotes CMD(«"») a the defense against spaces. 
+- braces (curly brackets) CMD(«{}») can also protect variables from +ambiguity. eg: CMD(«${foo}»)bar. They also group commands. +- Single quotes CMD(«'») are like double quotes, but are literal. +- Bash scripts have special variables: + - CMD(«$0»): script full path and name. + - CMD(«$1»): first command line argument. + - CMD(«$2»): second argument ... etc. + - CMD(«$#»): number of command line arguments. + - CMD(«$*»): list of arguments as a single string. + - CMD(«$@»): list of arguments as a delimited list. +- Parentheses CMD(«()») execute a command in a sub-shell. +- Double parentheses return the result of arithmetic expansion +(positive integer only). + +EXERCISES() + +- Write a simple script in which you define a variable with the string +"Hello World!". echo this variable without quotes, with single and +double quotes, and with braces (again with and without different +quotes). Become comfortable with the results. +- How do you return the results of a sub-shell ()? +- Write a simple script to add two positive integers. +- Write a simple script to add two positive integers supplied as +arguments to the script. +») + +HOMEWORK(« +Write a script using your favorite editor. The script should display +the path to your home directory and the terminal type that you +are using. +») + +SECTION(«Tests») + +- CMD(«[...]») is the POSIX sh test function. +- CMD(«[[...]]») is the Bash test function (more powerful). +- These tests are logical: they return TRUE or FALSE. +- Tests use logical operators. +- Spaces are a must! +- There are three types of operators: File, String and Integer. +- A few single file operators eg: CMD(«[[ -e somefile ]]») + - CMD(«-e»): file exists + - CMD(«-s»): file not zero size + - CMD(«-d»): file is a directory + - CMD(«-r»): you have read permission + - CMD(«-O»): you are the owner of the file +- A few multiple file operators eg: CMD(«[[ file1 -nt file2 ]]») + - CMD(«-nt»): first file newer than second + - CMD(«-ot»): first file older than second +- A few integer operators: + - CMD(«-eq»): equal to + - CMD(«-ne»): not equal to + - CMD(«-gt»): greater than + - CMD(«-ge»): greater than or equal to + - CMD(«-lt»): less than + - CMD(«-le»): less than or equal to +- A few string operators: + - CMD(«==»): equal to + - CMD(«!=»): not equal to + - CMD(«=~»): regex match (Bash specific) + - CMD(«-z»): string is null (zero length) + - CMD(«-n»): string is not null (zero length) +- When you understand how these operators work, you will have a good +idea of the kinds of things you can do in Bash. +- Tests can be combined with CMD(«&&»): "and" and CMD(«||») "or". + +HOMEWORK(« +Write a script that checks whether or not you own a file, and reports +back if you do not. (This is useful if you are working on multiple +user systems and need your script to remove many files. +») + +SECTION(«Conditionals») + +- The most commonly used Bash conditional structure is: CMD(«if») +... CMD(«then») ... CMD(«fi») +- A shorter version uses logic in place of if..then..fi. eg: CMD(«[[ +test ]] && { execute if TRUE; also execute }») + +EXERCISES() + +- Modify your calculator script to check for valid inputs: + - There must be two. + - They should not have letters +- Write a script to check for a directory, and create it if it +doesn't exist. +- Write a script to remove a file specfied as an argument, but only +if you are its owner. + + +HOMEWORK(« +Write a script that takes exactly one argument, a directory name. If +the number of arguments is more or less than one, print a usage +message. 
If the argument is not a directory, print another message. For +the given directory, print the five biggest files and the five files +that were most recently modified. +») + +SECTION(«Loops») + +- The most commonly used Bash loop structure is: CMD(for ... do +... done) +- The CMD(for) statement behaves a lot like a variable assignment. +- File globbing works: CMD(«for file in *.txt; do; done») +- Sequences are also useful: + - CMD(«for num in {1..5}; do; echo $num; done 1 2 3 4 5») + - CMD(«for num in {1..10..2}; do; echo $num; done 1 3 5 7 9») + +EXERCISES() +- Write a script that stores the results of a arithmetic in files +named by the inputs. + +HOMEWORK(« +Come up with a for loop which prints the first 10 squares (1, 4, +9, ...). +») + +SECTION(«Pipes and Here Strings») + +- Here strings are an alternative to conventional piping. +- Because they do not spawn a sub-shell, they retain variables in +the shell the script is running in. eg: instead of CMD(head file1 | +cut -f 1) write CMD(head | cut -f 1 <<< file1) +- They can be easier or more difficult to read, depending on your +taste. + +EXERCISES() +Write a script that uses pipes, change it to use a Here string. + + +HOMEWORK(« +Tie all the above together with the following task: + +Let's say that you want to perform an analysis by population +(k-means) cluster of some accessions (ecotypes). You want to +generate a separate bed file with the SNPs of the members of +each cluster (which you have previously calculated). + +The relevant plink argument is: CMD(«--keep “$keeplist”») where +keeplist is a two column file specifying family and ecotype +(made for human data). We will just duplicate the ecotype +in the two columns. e.g.: + + > cat keeplist + 88 88 + 107 107 + etc. + +I have provided a comma-separated file of which cluster each ecotype +belongs to: CMD(«/tmp/cluster_course/admix_pop.csv») Take a look at +this file. You will see that it is a comma separated file with +ecotype ID numbers in the first column and the cluster assignment +in the second. + +Use the 1001_cluster_course files as your test dataset. + +You suspect that your clusters might change, (in assignment +and number ), so you want to write a Bash script to generate +separate bed files for a given clustering. + +Hints: + +- Your script will be called something like this: + + sep_clust.sh all_snps_bedfile_root cluster_assignment.csv + +- You will have a loop. +- You will generate a list of cluster numbers from the +CMD(«cluster_assignment.csv») file. +- The cluster file has a header! CMD(«tail -n+2») will skip to the +second line. +- CMD(«grep “something$”») matches CMD(«something») at the +end of a line. +- You will generate a “keep” list for each cluster and supply +that to plink. +- CMD(«cut») expects tab-delimited input, but ours is +comma-delimited. Use CMD(«-d ","»). +- The keep list needs the ecotypes printed twice per line. The easiest +thing to use in this case is CMD(«awk»): + + awk -v OFS='\t' '{print $1, $1}' + +- Here, CMD(«-v») changes an internal value (CMD(«OFS»), the +“Output Field Separator”), CMD(«\t») specifies the delimiter +(a tab), CMD(«{...}») is the command, and CMD(«print $1, $1») +is the command to print column 1, column 1. + +- Remember: + - CMD(«uniq») requires sorted input. + - CMD(«sort -n») specifies a numerical sort. + - Generate as few temporary files as possible. +») + +HOMEWORK(« +If important data have been copied from one system to another, +one might want to check that both copies are identical. 
This is +fairly easy if they both live on the same system, but can be quite +tricky if they don't. For example, imagine files copied from an +NFS-mounted directory to the local hard drive. + + +», « + + + +») + +SECTION(«Substitution and Expansion») + +- expansion is performed on the command line after it has been split +into words +- several kinds of expansion: tilde, brace, arithmetic, pathname, +parameter and variable, history +- command substitution + +EXERCISES() + +- Give an example for each type of expansion. +- Which expansions can change the number of words? +- Create a list of "words" with CMD(«apg -c 42 -n 10000 > +foo»). Transform each word to upper case, using the case-modification +operator CMD(«^^») as follows: CMD(«while read a; do echo ${a^^}; +done < foo > bar»). Compare the running time of this command with (a) +CMD(«tr [a-z] [A-Z] < foo > bar») and (b) CMD(«while read a; do tr +[a-z] [A-Z] <<< "$a"; done < foo > bar»). Try to beat the fastest +implementation using your favorite tool (CMD(«sed»), CMD(«perl»), +CMD(«python»), ...). +- The command CMD(«find . -maxdepth 1 -mindepth 1 -type d») lists +all directories in the CWD. Describe an CMD(«ls») command which +does the same. +- Scripts often contain code like CMD(«find . | while read f; do +something with "$f"; done»). While this code works for file names +which contain spaces or tab characters, it is not bullet-proof because +file names may also contain the newline character. The only character +that can not be part of a file name is the null character. This is +why CMD(«find(1)») has the CMD(«--print0») option to separate the +file names in its output by null characters rather than the newline +character. Find a way to make the CMD(«while») loop work when it +is fed a file list produced with CMD(«find --print0»). Check the +correctness of your command by using CMD(«printf 'file\n1\0file 2'») +as the left hand side of the pipe. +») + +HOMEWORK(« +- Write a shell script CMD(«varstate») which takes the name of a +variable as CMD(«$1») and determines whether the named variable +is (1) set to a non-empty string, (2) set to the empty string, or +(3) unset. Verify the correctness of your script with the following +commands: + - CMD(«foo=bar ./varstate foo # case (1)») + - CMD(«foo= ./varstate foo # case (2)») + - CMD(«unset foo; ./varstate foo # case (3)») +») + +SECTION(«Functions») + +- code block that implements a set of operations +- for tasks which repeat with only slight variations +- syntax: CMD(«f() {commands; }») +- positional parameters (CMD(«$1»), CMD(«$2»), ...) +- special parameters: CMD(«$*»), CMD(«$@»), CMD(«$#») + +EXERCISES() + +- Understand your smiley of the day (and run it if you are brave): +CMD(«:() { :& :& };:») +- Write a function which checks whether the passed string is a +decimal number. +- Consider this simple function which returns its first argument: +CMD(«foo() { return $1; }»). Find the largest positive integer this +function can return. +- Write a function which returns the sum of the first and the 10th +argument. + +SECTION(«Arrays and Hashes») + +- bash-2: arrays, bash-4: hashes (associative arrays) +- zero-based, one-dimensional only +- three ways of assigning values +- negative parameters in arrays and string-extraction (bash-4.2) + +EXERCISES() + +- The following three array assignments are equivalent: +CMD(arr=(one two three)), CMD(«arr=([0]=one [1]=two [2]=three)»), +CMD(«arr[0]=one; arr[1]=two; arr[2]=three»). Discuss the pros and +cons of each version. 
+- Define an array with CMD(«arr=(one two three)»). + - Learn how to determine the number of elements that have been assigned + (three in this example). + - Convert all entries to upper case without iterating. + - Print all entries which do not contain an CMD(«"o"») character, + again without iterating (result: CMD(«three»)). +- Use arrays to write a bash script that lists itself, including +line numbers, and does not call any external command (CMD(«sed, +awk, perl, python, ...»)). Try to get rid of the loop in this +REFERENCE(«self-list.bash», «solution»), +- CMD(«rot13») is a simple "encryption" algorithm which shifts each +letter in the alphabet a-z by 13 characters and leaves non-letter +characters unchanged. That is, CMD(«a») maps to CMD(«n»), +CMD(«b») maps to CMD(«o»), ..., CMD(«m») maps to CMD(«z»), +CMD(«n») maps to CMD(«a»), and so on. Implement CMD(«rot13») +using an associative array. Compare your solution with this +REFERENCE(«rot13.bash», «implementation») which reads from +stdin and writes to stdout. Verify that "encrypting" twice with +CMD(«rot13») is a no-op. +- Examine the CMD(BASH_VERSINFO) array variable to check whether the +running bash instance supports negative array indices. +- Write a bash script which reads from stdin and prints the last word +of the input. + +HOMEWORK(« +Bash-4.2 added support for negative array indices and string +extraction (count backward from the last element). Apply this feature +to print all but the first and last character of the last word of +each input line. +», « +The script below implements a loop which reads lines from stdin into +an array. In each interation of the loop we use CMD(«${arr[-1]}») +to get the last word of the line. Substring expansion with -1 as the +offset value refers to the last character within the word. + + #!/bin/bash + # The -a option assigns the words of each line to an array variable. + while read -a arr; do + # + # If the input line contains only whitespace, there is + # nothing to do. + ((${#arr[@]} == 0)) && continue + # + # Negative array indices count back from the end of the + # array. In particular the index -1 references the last + # element. Hence ${arr[-1]} is the last word. + # + # To print the first and the last character of the last + # word, we use substring expansion: + # ${parameter:offset:length} expands to up to length + # characters of the value of parameter starting at the + # character specified by offset. As for array indices, + # negative offsets are allowed and instruct bash to use + # the value as an offset from the *end* of the value, + # with -1 being the last character. + # + # A negative offset must be separated from the colon by + # a space to avoid confusion with the :- expansion (use + # default values). For example, ${var:-1:1} expands to + # the string "1:1" if a is unset (and the value of var + # otherwise). + echo "${arr[-1]: 0: 1} ${arr[-1]: -1: 1}" + done + +») + +SECTION(«Signals») + +- trap +- exit code 128 + n + +EXERCISES() + +- Run CMD(«sleep 10»), interrupt the command with CMD(«CTRL+C») and +examine CMD(«$?»). Hint: CMD(«trap -l») prints all signal numbers. +- The REFERENCE(«stale_tmpfile.bash», «script») below is flawed +in that it leaves a stale temporary file when interrupted with +CMD(«CTRL+C»). Fix this flaw by trapping CMD(«SIGINT»). 
+ +SECTION(«Shell Options») + +- Confusing: + - _many_ options, some really weird ones + - two ways to set options: CMD(«set»), CMD(«shopt») + - CMD(«set +option») _disables_ CMD(«option») +- aim: Introduce examples for the most useful options +- CMD(«-x»): debugging +- CMD(«-u»): parameter expansion is treated as error for unset variables +- CMD(«-e»): exit on first error +- pipefail: Get _all_ exit codes of a pipeline +- nullglob: avoid common pitfalls with pathname expansion +- extglob: activate extended pattern matching features + +EXERCISES() + +- Find at least two bugs in the REFERENCE(«catch_the_bug.bash», +«script») below. Run the script twice, once with +CMD(«bash catch_the_bug.bash») and once with CMD(«bash -x +catch_the_bug.bash»). Compare the output. +- There is a subtle bug in the the +REFERENCE(«HungarianCamelSquare.bash», «HungarianCamelSquare.bash») +script below. Run the script with and without bash's CMD(«-u») option +and compare the error messages. Discuss whether it is reasonable to +add CMD(«set -u») to existing scripts. +- What's the exit code of the pipeline CMD(«/notthere | wc -l»)? +Run CMD(«set -o pipefail»), then repeat the command. Search the bash +man page for CMD(«pipefail») and learn about the CMD(«PIPESTATUS») +array variable. Repeat the above command and examine the contents +of CMD(«PIPESTATUS»). +- Assume that CMD(«/etc») contains only "reasonable" file +names (without space or other "funny" characters). Yet the +REFERENCE(«count_config_files.bash», «count_config_files.bash») +script is buggy. Point out the flaw _before_ you try it out, then run +it to confirm. Insert CMD(«shopt -s nullglob») before the loop and +run the script again. Search the bash manual page for CMD(«nullglob») +for an explanation. + +HOMEWORK(« +The REFERENCE(«rm_tmp.bash», «rm_tmp.bash») script is seriously +flawed and would sooner or later create major grief if the CMD(«rm») +command was not commented out. Find at least three bugs in it. Run +CMD(«bash rm_tmp.bash /notthere») and CMD(«bash -e rm_tmp.bash +/notthere») to see the CMD(«-e») option in action. +», « + +- If the CMD(«cd») command fails, the CMD(«rm») command will be +executed in the current directory. This can happen for several reasons: + - CMD(«$1») does not exist, + - CMD(«$1») is not a directory, + - The executing user has no permissions to change into CMD(«$1»), + - CMD(«$1») contains whitespace characters, + - CMD(«$1») is a directory on a network share which is currently + unavailable. This does not happen with NFS, but may happen with CIFS + (Microsoft's Common Internet File System). +- If no argument is given, the CMD(«rm») command will be executed +in the home directory. +- The CMD(«rm») command does not remove all files: filenames starting +with a dot will be omitted. +- If the directory contains more files than the maximal number of +arguments in a command line, the CMD(«rm») command fails. The limit +depends on the system, but is often as low as 32768. +- If the directory contains a file named CMD(«-r»), the directory +will be removed recursively. +- If CMD(«$1») is an empty directory, the command fails because +there is no file named CMD(«"*"»). See the CMD(«nullglob») shell +option if you don't know why. +- The command fails if CMD(«$1») contains subdirectories. +- Even the CMD(«echo») command is buggy: If there is a file +CMD(«-n»), it will be treated as an option to CMD(«echo»). +») + +HOMEWORK(« +- Suppose you'd like to remove all leading occurences of the character +CMD(«"a"») from each input line. 
The script should read input lines +from CMD(«stdin») and write its output to CMD(«stdout»). For +example, the input line CMD(«aabba») should be transformed into +CMD(«bba»). + + - Write a bash script that runs a suitable external command of + your choice (e.g., CMD(«sed»), CMD(«awk»), CMD(«perl») or + CMD(«python»)) for each input line. + - Come up with an alternative script that does not run any commands. + - Implement yet another version that uses extended globbing. + +- Create a suitable input file with 100000 lines by running +CMD(«base64 < /dev/urandom | head -n 100000 > foo»). Test the +performance of the three implementations of the above script by +executing CMD(«time script < foo») and discuss the result. +», « +- Bash script with external command: + + #!/bin/bash + + while read line; do + sed -e 's/^a\+//' <<< "$line" + done + +- Bash script without external command (note that CMD(«printf») + is a shell builtin): + + #!/bin/bash + + while read line; do + n=0 + while [[ "${line:$n:1}" == 'a' ]]; do + let n++ + done + printf '%s\n' "${line:$n}" + done + +- Bash script with extended globbing: + + #!/bin/bash + + shopt -s extglob + + while read line; do + printf '%s\n' "${line/*(a)}" + done + +- Running times: + - external command: 289s + - without external command, without extglob: 4s + - extglob: 8s + +- Discussion: External commands hurt plenty. Try to avoid them + inside of loops which execute many times. The extglob feature is + handy but is still twice as expensive than the open-coded version + which avoids pattern matching alltogether. Note that the simple + CMD(«sed -e 's/^a\+//' foo») also does the job, and is even two + orders of magnitude faster than the fastest bash version. However, + this approach is not very flexible, hence unsuitable for real world + applications which do more than just write the transformed string + to stdout. +») + +SECTION(«Miscellaneous») + +- IFS +- read -ie +- ** (globbing) +- prompt +- Indirect variable referencing (eval, ${!x}, nameref) + +EXERCISES() + +- Write a bash script which prints the username and login shell of +each user defined in CMD(«/etc/passwd»). Hint: Set CMD(«IFS») +and employ the bash CMD(«read») builtin with suitable options to +read each line of CMD(«/etc/passwd») into an array. Compare your +solution with this REFERENCE(«print_login_shells.bash», «script»). +- Run CMD(«read -p "> " -ei "kill -9 -1" t; echo "you entered: +$t"») and note how it provides nice readline-based editing. Check +CMD(«bash») man page for other options to the CMD(«read») builtin, +like CMD(«-s») and CMD(«-t»). +- Run CMD(«ls ~/**/*.pdf»). Search the bash manual page for +CMD(«**») and CMD(«globstar») to understand the meaning of the +CMD(«**») pattern in pathname expansion. Next, run CMD(«shopt -s +globstar && ls ~/**/*.pdf») and marvel. +- Is there a way in bash to distinguish between undefined variables +and variables which have been set to the emtpy string? Hint: examine +the difference between CMD(«${x-42}») and CMD(«${x:-42}»). +- Setting the environment variable CMD(«PROMPT_COMMAND») +to a function instructs bash to call this function prior to +issuing the prompt. Run CMD(«prompt_command() { PS1="$PWD > "; +}; PROMPT_COMMAND=prompt_command») to change your prompt. Modify +the function to replace the middle part of the path by '...' if +CMD(«$PWD») exceeds 10 characters. 
+- During parameter expansion, if the first character of a parameter +is an exclamation point (!), bash uses the value of the variable +formed from the rest of parameter as the name of the variable rather +than the value of the parameter itself. This is known as _indirect +expansion_. Run CMD(«a=42; x=a; echo ${!x}») to see the effect. +- Examine and run the REFERENCE(«minmax.bash», «minmax script») +whose CMD(«minmax()») function is given the _name_ CMD(«X») of a +variable, and a sequence of positive integers. The function computes +the minimum and the maximum given value and sets the variables +CMD(«X_min») and CMD(«X_max») accordingly. + +HOMEWORK(« +Get rid of the CMD(«eval») statement in the +REFERENCE(«minmax.bash», «minmax script») by passing +variables declared with CMD(«-n») to assign the CMD(«namref») +attribute. Hint: search for (nameref) in the bash manual. +») + +HOMEWORK(« +Read the CMD(«bashbug») manual page and discuss +under which circumstances one should file a bug report. +Download the source code of latest version of bash from +XREFERENCE(«ftp://ftp.gnu.org/pub/gnu/bash», «gnu ftp server»), +apply all patches found in the CMD(«bash-4.3-patches») subdirectory +and compile the package. Run the compiled executable and execute +CMD(«echo ${BASH_VERSINFO[@]}»). +») + +SECTION(«Job Control») + +- suspend/resume selected processes +- POSIX.1 (1988) +- aim: understand foreground/background jobs, Ctrl+Z, Ctrl+C, +CMD(«fg»), CMD(«bg») +- job <=> process group <=> pipeline (+descendants) <=> PGID +- (interactive) session := collection of process groups +- setsid() syscall creates new session, PGID := PID of calling process +- session leader: process which called setsid(), SID: PID of session +leader +- terminal's current process group (TPGID) +- TPGID determines foreground PG = CMD(«{P: PGID(P) == TPGID}») + +EXERCISES() + +- Examine all fields in the output of CMD(«ps j»). +- Assume a typical scenario with one background process and another +process running in the foreground. How many sessions are there? Which +of the three processes are session leaders? Determine all process +groups. Verify your result by running CMD(«sleep 100 & ps j»). +- What happens if a background process tries to read from +CMD(«stdin»)? Verify your answer by executing CMD(«cat &»). +- What happens if the session leader terminates while there are +still processes running in a background process group? To find out, +open a terminal, run CMD(«sleep 100&») and kill the session leader +(the shell) with CMD(«kill -9 $$»). Open another terminal and +execute CMD(«ps -aj») and examine the row that corresponds to the +CMD(«sleep») process. +- Look at how bash handles a pipeline by executing CMD(«ps xj | +cat»). +- Verify that in the output of CMD(«ps j») the TPGID and the PID +columns coincide while the two columns differ if the command is run +in the background (CMD(«ps j &»)). Determine the foreground process +group in both cases. +- Read the section on job control in the bash manual and make yourself +familiar with the various ways to refer to a job in bash (CMD(«%»), +CMD(«%n»), CMD(«%-,»), CMD(«%+»)). + +SUPPLEMENTS() + +SUBSECTION(«stale_tmpfile.bash») + +
+
+	#!/bin/bash
+	f=$(mktemp) || exit 1
+	echo "starting analysis, temporary file: $f"
+	sleep 100
+	echo "done, removing $f"
+	rm -f "$f"
+
+
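+
+One possible fix for the SIGINT exercise (a sketch): install a trap
+right after the temporary file has been created, so the file is
+removed even if the script is interrupted. An EXIT trap would work
+as well.
+
+	#!/bin/bash
+	f=$(mktemp) || exit 1
+	# remove the temporary file and exit when interrupted or terminated
+	trap 'rm -f "$f"; exit 1' INT TERM
+	echo "starting analysis, temporary file: $f"
+	sleep 100
+	echo "done, removing $f"
+	rm -f "$f"
+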
+ +SUBSECTION(«self-list.bash») + +
+
+	#!/bin/bash
+	IFS='
+	'
+	a=($(cat $0))
+	for ((i = 0; i < ${#a[@]}; i++)); do
+		echo "$((i + 1)): ${a[$i]}"
+	done
+
+
+ +SUBSECTION(«rot13.bash») + +
+
+	#!/bin/bash
+	declare -A h=(
+		[a]=n [b]=o [c]=p [d]=q [e]=r [f]=s [g]=t [h]=u [i]=v [j]=w [k]=x
+		[l]=y [m]=z [n]=a [o]=b [p]=c [q]=d [r]=e [s]=f [t]=g [u]=h [v]=i
+		[w]=j [x]=k [y]=l [z]=m
+	)
+
+	while read -r line; do
+		for ((i = 0; i < ${#line}; i++)); do
+			c="${line:$i:1}"
+			# quote the expansion so that spaces and other
+			# non-letter characters are passed through unchanged
+			echo -n "${h[$c]:-$c}"
+		done
+		echo
+	done
+
+
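+
+For comparison, the same transformation can be performed with a
+single external command. This also gives a quick way to check the
+bash implementation above, since "encrypting" twice must reproduce
+the input (here foo is any test file):
+
+	# rot13 for lower-case letters only, like the array above
+	echo uryyb | tr 'a-z' 'n-za-m'                              # prints "hello"
+	tr 'a-z' 'n-za-m' < foo | tr 'a-z' 'n-za-m' | diff - foo    # no output: rot13 twice is a no-op
+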
+ +SUBSECTION(«catch_the_bug.bash») + +
+
+	#!/bin/bash
+	if (($# == 0)); then
+		# no argument given, choose a random number instead
+		x=$(($RANDOM / 3276 + 1)) # between 1 an 10
+	else
+		x=$1
+	fi
+	echo "1/$x is approximately $((100 / $x))%"
+
+
+ +SUBSECTION(«HungarianCamelSquare.bash») + +
+
+	#!/bin/bash
+	declare -i ThisVariableIsATemporaryCounter
+
+	for ((ThisVariableIsATemporaryCounter=0; ThisVariableIsATemporaryCounter < 10; ThisVariableIsATemporaryCounter++)); do
+		echo "$ThisVariableIsATemporaryCounter * $ThisVariableIsATemporaryCounter is $(($ThisVariableIsATenporaryCounter * $ThisVariableIsATemporaryCounter))"
+	done
+
+
+ +SUBSECTION(«rm_tmp.bash») + +
+
+	#!/bin/bash
+	echo "removing all temporary files in $1"
+	cd $1
+	echo removing *
+	# rm *
+
+
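+
+A possible rewrite which addresses several of the problems listed in
+the homework answer (a sketch using GNU find/xargs; as in the
+original, the actual removal is left as a dry-run):
+
+	#!/bin/bash
+	# refuse to run without an argument instead of acting on $HOME
+	dir=${1:?usage: rm_tmp.bash <directory>}
+	# quote the argument and abort if the directory cannot be entered
+	cd -- "$dir" || exit 1
+	echo "removing all files in $PWD"
+	# find/xargs copes with dotfiles, a file named "-r", empty
+	# directories and arbitrarily many files; drop the "echo" to
+	# actually delete
+	find . -mindepth 1 -maxdepth 1 -type f -print0 | xargs -0 -r echo rm --
+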
+ +SUBSECTION(«count_config_files.bash») + +
+
+	#!/bin/bash
+	for c in {a..z}; do
+		files=(/etc/$c*.conf)
+		echo "There are ${#files[@]} config files in /etc that start with $c: ${files[@]}"
+	done
+
+
+ +SUBSECTION(«print_login_shells.bash») + +
+
+	#!/bin/bash
+	while IFS=: read -ra a; do
+		echo "${a[0]} ${a[6]}"
+	done < /etc/passwd
+
+
+ +SUBSECTION(«minmax.bash») + +
+	minmax()
+	{
+		local var min max
+
+		var="$1"
+		shift
+		min=$1
+		max=$1
+		shift
+		while (($#)); do
+			(($1 < $min)) && min=$1
+			(($1 > $max)) && max=$1
+			shift
+		done
+		eval ${var}_min=$min
+		eval ${var}_max=$max
+	}
+
+	print_minmax()
+	{
+		local var="$1"
+		local min="${var}_min" max="${var}_max"
+
+		echo "min: ${!min}, max: ${!max}"
+	}
+
+	minmax a 3 4 2 9 4
+	print_minmax a
+
+
+
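+
+For the nameref homework, one possible variant of minmax() without
+eval (requires bash 4.3 or newer; print_minmax() stays the same):
+
+	minmax()
+	{
+		# namerefs: assignments to _min and _max go to the
+		# variables named ${1}_min and ${1}_max
+		local -n _min="${1}_min" _max="${1}_max"
+		shift
+		_min=$1
+		_max=$1
+		shift
+		while (($#)); do
+			(($1 < $_min)) && _min=$1
+			(($1 > $_max)) && _max=$1
+			shift
+		done
+	}
+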
diff --git a/Command_Line_Utilities.m4 b/Command_Line_Utilities.m4 new file mode 100644 index 0000000..568b357 --- /dev/null +++ b/Command_Line_Utilities.m4 @@ -0,0 +1,571 @@ +TITLE(« + + Free yourself from all the confusion «...» Forget about desktop + decor, senseless glitter, and animations «....» Say goodbye «...» to + the rodent and welcome the ultimate interface to your computer. -- + Ratpoison propaganda + +», __file__) + + +SECTION(«Essential Command Line Tools») + +- man, apropos +- cd, pwd, ls, stat +- rm, cp, mv, ln +- echo, printf, cat +- less +- tar +- file +- find, xargs +- cut, sort, uniq, wc +- mkdir, rmdir +- df, du +- nice, ionice +- ps, kill, top, htop, jobs +- ping, wget + + +SECTION(«CMD(«gzip»), CMD(«bzip2») and CMD(«xz»)») + +EXERCISES() + +- create the file CMD(reads.fq) from the supplement using CMD(cat +> reads.fq). Discuss the usage of CMD(Ctrl-d) versus CMD(Ctrl-c) +to end the CMD(cat) process. +- run CMD(file) on CMD(reads.fq). Read it using CMD(less). Compress +the file using CMD(gzip). Run CMD(file) again. Try reading it. + +HOMEWORK(« +Reading from the special file CMD(«/dev/urandom») returns random +data. Explain what the command CMD(«head -c 10000000 /dev/urandom | +base64 > foo») does. Execute CMD(«cp foo bar») and CMD(«cp foo +baz») to create two copies of CMD(«foo»). Use the CMD(«gzip»), +CMD(«bzip2») and CMD(«xz») commands to compress the three files +using three different compression algorithms. Hand in the running +time of each command, the sizes of the compressed files and your +conclusion about which algorithm/program should be preferred. +», « +The CMD(«head | base64») command reads 10M of random data and encodes +these with the base64 algorithm, which sequentially encodes three +input bytes into four output bytes, representing the six significant +bits of each output byte as a character in the set CMD(«{a-z, A-Z, +0-9, +, /}»). The result is written to the file named CMD(«foo»). +The commands CMD(«gzip foo»), CMD(«bzip2 bar») and CMD(«xz baz») +each compress one of the files. Running times were 0.9s, 3.1s, 8.9s +respectively. It's surprising to see how big the differences of the +three running times are, although it is kind of expected that the most +"modern" program, CMD(«xz»), has the highest running time. At first +sight it might also be surprising that the sizes of the compressed +files are almost identical (all three were almost exactly 10M, the +size of the unencoded data). But when taking into account that we +are dealing with random data, it is clear that one can only expect +a compression factor of 3/4 due to the base64 encoding, no matter +which algorithm is used. For non-random data the file sizes would have +differed much more, and it depends on the use case which algorithm is +the best. For example, if the compressed file will be downloaded many +times from an FTP server, it might be worth to spend extra CPU time +during compression to make the file as small as possible in order to +save bandwith. +») + +SECTION(«CMD(«sed»), CMD(«grep»), CMD(«tr»)») + +EXERCISES() + +- Create a file with three lines (CMD(printf "one\ntwo\nthree\n" > +foo)). Use CMD(sed) or CMD(tr) to replace all newline characters with +spaces. Discuss cases where it is not an option to open the file in +an editor. +- Unpack (or recreate) the file CMD(reads.fq) from the previous +exercise. +- Extract only read2 from CMD(reads.fq) using CMD(grep). To do that, +check CMD(man grep) for the CMD(-A) and CMD(-B) options. 
+- Use CMD(sed) to extract only the lines containing the ID. What are +the dangers of doing it the intuitive way using grep? +- A little more advanced: use sed to write a very short, highly +efficient FastQ to FastA converter. + +HOMEWORK(« +Find a command which prints the usage of all file systems which are +NFS-mounted from a specific server (e.g., neckar, or arc). +», « +This exercise shows that things can be trickier than they look like at +first glance. The complications are (a) the string "CMD(«arc»)" may +well be part of an unrealated mount point, and (b) the server name can +be specified as either CMD(«arc») or CMD(«arc.eb.local»). Hence, +the simple CMD(«df -t nfs | grep arc») is not robust enough, at +least for scripts for which subtle future breakage (when another file +system is mounted) should be avoided. A better variant is + + df -t nfs | grep '^arc\(\.eb\.local\)*:' +») + +SECTION(«CMD(«chmod»), CMD(«chgrp»)») + +- Change permissions or group. + +EXERCISES() + +- Discuss the implications of project directories with mode 777. +- Use the -group option to CMD(find) to locate files and directories +whose GID is ebio. Discuss whether changing the GID to e.g. abt6 with +CMD(chgrp -R) would trigger a backup of these files. For the CMD(chgrp) +command, which flags besides -R would you specify in order to handle +symbolic links correctly? +- Discuss why CMD(chown) is root-only. + +HOMEWORK(« +Come up with a CMD(find | xargs chmod) command that turns on the +group-executable bit for all files which are executable for its owner, +but leaves non-executable files unchanged. Does your command work +with filenames containing spaces. Does it work if a filename contains +a newline character? +», « +CMD(«find ~ -perm /u+x -not -perm /g+x -print0 | xargs -0 chmod +g+x»). The CMD(«-not -perm /g+x») part is not strictly necessary +but it may speed up the command, and it preserves the ctime of those +files which are already group-executable. The CMD(«-print0») +option is the only way to make this type of command safe because any +other character can be part of a path. So it is strongly recommend +to always use it whenever possible. Unfortuntately this feature is +not part of the POSIX standard. It is supported on Linux and FreeBSD +(hence MacOS) though. +») + +SECTION(«The LDAP user database») +- LDAP: campus-wide user database +- stores your personal information: name, password, phone number,... +- read-only access w/o auth +- write access through web app +- ldapsearch, id, finger, last, w + +EXERCISES() + +- Run the commands CMD(finger $LOGNAME) and discuss the meaning of + all output fields. +- Run the command CMD(id). What's the meaning of the terms CMD(uid) + and CMD(gid) shown in the output? +- Show your own LDAP entry: CMD(ldapsearch -x uid=$LOGNAME). Use a + similar command to show the entry of somebody who left the + institute. How can one tell whether an account is active? +- List all abt6 users: CMD(ldapsearch -x cn=abt6) +- use CMD(ldapsearch -x) to find other people at our institute with + the same surname ("sn") as you. +- use CMD(id) to figure out what groups you are in. + +HOMEWORK(« +Find all former members of your department or group. 
+», « +Example for department 1: + + ldapsearch -x homeDirectory | grep -A 1 vault | grep '^homeDirectory: /ebio/abt1' +») + +SECTION(«CMD(«rsync»)») + +- file-copying tool by Andrew Tridgell (1996) +- remote copying via ssh +- synchronization +- performant, sends only differences +- aim: know the most common options + +EXERCISES() + +- Preparation: Create a scratch directory in /tmp and store 10 10M +text files there: CMD(«for i in $(seq 10); do (head -c 7500000 +/dev/urandom | base64 > $RANDOM &) ; done»). +- Create a copy (also in CMD(«/tmp»)) with CMD(«rsync») and measure +the running time: CMD(«time rsync -ax $src $dest»). Check the rsync +manual for the meaning of these two options. +- Modify and remove some files in source directory, run rsync again, +this time with the CMD(«-ax --delete») options to propagate your +changes to the destination. +- Suppose files which contain a "1" in its file name are generated +any you don't want to synchronize these. Find the correct argument +for rsync's CMD(«--exclude») option. Use the CMD(«--dry-run -v») +options to check. +- Find a way to use rsync for a non-local copy where the remote ssh +server listens on a port different than 22. To try this, forward +the port 24865 + $UID to port 22 with CMD(«ssh -NfL $((24865 + +$UID)):localhost:22 localhost») so you can ssh into localhost through +this port. +- Look at the XREFERENCE(http://people.tuebingen.mpg.de/maan/dss/, +dyadic snapshot scheduler). +- Suppose source and destination directories are on different hosts and +contain slightly different versions of a single huge file. For example, +somewhere near the beginning a small part of the file was deleted, +and another small part was appended to the end of the file. Suppose +further that the file is so large (or the network so slow) that +copying all of it would take days. Think about an algorithm that +finds the above differences without transferring the whole file. + +HOMEWORK(« +The XREFERENCE(https://en.wikipedia.org/wiki/MD4, Wikipedia page) on +the CMD(«MD4») message-digest algorithm states that the security of +CMD(«MD4») has been severely compromised. The CMD(«MD5») algorithm +is also known to be broken for years. Yet these two algorithms are +an essential part of rsync. Is this a problem for the correctness +of CMD(«rsync»)? +», « +The fact that one can find hash collisions for CMD(«MD4») +and CMD(«MD5») is usually not a problem for the integrity of +CMD(«rsync»). First, the weak hash algorithms are not used for +any type of authentication, since CMD(«rsync») relies on SSH +for this. Second, why would a malicious CMD(«rsync») user want +to modify the files on the source or destination to create a hash +collision if he already has access to these files? On the other hand, +for non-manipulated files, the probability of a hash collision is so +small that other types of failures are much more likely. Finally, the +CMD(«MD4») and CMD(«MD5») algorithms are used in combination with +other checksums, so modifying a file while keeping its CMD(«MD5») +hash the same is not enough to fool CMD(«rsync»). +») + +HOMEWORK(« +Describe the idea of rolling checksums and how they are used in +CMD(«rsync»). +») + +SECTION(«The Cron service») + +- cron daemon (CMD(«crond»)) executes scheduled commands +- CMD(«crontab») (file): table used to drive crond +- CMD(«crontab») (program): command to install, update, or list tables + +EXERCISES() + +- Preparation: Set the CMD(«EDITOR») variable to your favorite editor. 
+- Run CMD(«crontab -l») to see your current crontab. +- Set up a cron job which runs every minute and appends the current + date to a file. +- Note that there are two CMD(«crontab») manual pages. The command + CMD(«man crontab») gives the manual for the crontab program. Run + CMD(«man 8 crontab») to see the manual page for the configuration + file. Look at the examples at the end. +- Write a script which checks if a device is mounted and if yes + execute some operations. + +HOMEWORK(« +Discuss the difference between a cron job and a script which does +something like CMD(«while :; do run_experiment; sleep $TIMEOUT; +done»). Describe what happens when + +- the user logs out or closes the terminal, +- the server gets rebooted, +- CMD(«run_experiment») runs for more than CMD(«$TIMEOUT») seconds. +», « +There are many differences: + +- A cron job is started by the cron daemon and is executed in different +environment where some variables like CMD(«PATH») might have a +different value. +- The cron deamon is started automatically during boot, so a cron +script will still be run after a reboot while the command loop will +not be restarted automatically in this case. +- If the user logs out or closes the terminal, the command loop might +be killed. This depends on whether it was started in the background +and whether CMD(«stdin»), CMD(«stdout») and CMD(«stderr») +were redirected. +- The cron job might be started twice if the previous invocation is +still running. This can happen if a file system is slow or the job +is running for longer than the scheduling period. +- The timing of the command loop will drift over time because the +running time of the script is not taken into account. The cron script, +on the other hand, will always run at the specified times. +») + +SECTION(«Regular Expressions») + +- regex: sequence of characters that forms a search pattern +- Kleene (1956) +- compilation: RE -> finite automaton +- Chomsky hierarchy +- basic, extended, perl + +EXERCISES() + +- Understand the difference between basic, extended and perl regular +expressions. Discuss which ones should be preferred in various cases. +- In a web service, is it safe to match a user-specified +expression against a known input? Does it help if both +regex and input are bounded by, say, 1000 characters? Read the +XREFERENCE(«https://en.wikipedia.org/wiki/ReDoS», «ReDos») wikipedia +page for the answer. +- Match the "evil" extended regex CMD(«(a|aa)+») against a long +string which contains only "CMD(«a»)" characters. Try various +regex implementations (CMD(«sed»), CMD(«grep»), CMD(«awk»), +CMD(«perl»), CMD(«python»)). +- Discuss the consequences of the fact that perl regular expressions +do EMPH(«not») necessarily correspond to regular languages in the +Chomsky hierarchy. +- Is it possible to construct a regular expression that matches a +line of text if and only if the line is a syntactically correct perl +or C program? + +HOMEWORK(« +Find a regular expression which matches any number of "CMD(«a»)" +characters, followed by the EMPH(same) number of "CMD(«b»)" +characters. Alternatively, prove that no such regular expression +exists. +», « +If there was a regular expression that matched any number of +"CMD(«a»)'s" followed by the same number of "CMD(«b»)'s" there would +exist a finite automaton that describes the characters that have been +matched so far. But apparently such an automaton would necessarily +have infinitely many states. Hence no regular expression exists that +matches all such strings. 
Another way to prove this is to apply the +so-called EMPH(«pumping lemma»). + +The take-away of this exercise is (a) to get a feeling about what +can be described by regular expressions, and (b) that understanding +the underlying theory can help you to avoid wasting a lot of effort +on trying to do provably impossible things. +») + +SECTION(«CMD(«awk»)») + +- general purpose text utility +- Aho, Weinberger, Kernighan (1977) +- search files for certain patterns, perform actions on matching lines +- specified in POSIX, here: gawk (GNU awk) + +EXERCISES() + +- What does the following awk program do? CMD(«ls -l | awk '{x += +$5} END {print x}'») +- Check the awk uses in the +XREFERENCE(«http://ilm.eb.local/gitweb/?p=cluster;a=blob;f=scripts/admin/cmt;hb=HEAD», +«cluster management tool»). +- Write an CMD(«awk») program which prints the length of the longest +input line. +- Write an awk program which uses an associative array to print how +often different words appear in the input. +- Check the +XREFERENCE(«http://www.gnu.org/software/gawk/manual/gawk.html», +«gawk manual») to get an idea of CMD(«awk») features and its +complexity. +- Sometimes it is useful to initialize an awk variable from a shell +environment variable. Learn to use the CMD(«--assign») option to +achieve this. +- Discuss what the CMD(«--traditional») option to CMD(«awk») +does and when it should be used. + +HOMEWORK(« +On any system, determine the average file system size and the average +percentage of used space. +», « +The command CMD(«df -P») prints both the size and the percentage +of used space for each file system as the second and fifth column. +The following pipeline prints the desired average percentages: + + df -P | awk '{s2 += $2; s5 += $5} END {print s2 / NR, s5 / NR}' +») + +SECTION(«CMD(«screen»)») + +- terminal multiplexer +- Laumann, Bormann (1987), GNU +- sessions, detach, attach +- multiuser sessions + +EXERCISES() + +- Start screen, run CMD(«ls»), detach the session, log out, log in, +re-attach the session. +- In a screen session with multiple windows, type CMD(«CTRL+a "») +to see the window list. +- The key to move the cursor to the beginning of the command +line is mapped to the same character as the screen escape key, +CMD(«CTRL+a»). Learn how to workaround this ambiguity. +- Type CMD(«CTRL+a :title») to set the window title. +- Put the following line to your .screenrc: CMD(«caption always +"%{cb} «%{wb}Act: %-w%{cb}%>%n(%t)%{-}%+w%<%{cb}»%{-}"») and see +what it does. +- Learn to copy and paste from the screen scrollback buffer +(CMD(«CTRL+a ESCAPE»)). Increase the size of the scrollback buffer +in your .screenrc file. +- Learn how to use CMD(«split») and CMD(«split -v»). +- To allow your colleage CMD(«nubi») to attach your screen session +in read-only mode, create a suitable REFERENCE(«.screenrc.multi», +«multiuser screenrc») file and start a session with this config file: +CMD(«screen -C ~/.screenrc.multi -S multi»). Ask CMD(«nubi») +to run CMD(«screen -x $OWNER/multi»), where CMD(«OWNER») is your +username. Read the section on the CMD(«aclchg») command in the +screen manual for how to allow write access. Discuss the security +implications. 
+ +SECTION(«CMD(«adu»)») + +- advanced disk usage +- creates database, only slow "once" +- produces summary or list of largest directories + +EXERCISES() + +- Create an adu database from your project directory: CMD(«adu +-C -d $DB -b $BASE»), where CMD(«$DB») is the (non-existing) +directory to create for the database, and CMD(«$BASE») is a existing +(sub-)directory of your storage project. +- List the 10 largest directories: CMD(«adu -S -d $DB -s "-m +global_list -l 10"»), then the top-10 directories with respect to +file count: CMD(«adu -S -d $DB -s "-m global_list -l 10 -s f"»). +- Print the 10 largest directories, but consider only those which +contain the letter "CMD(«a»)". +- Read the adu manual page to learn how to customize the output with +format strings. + +SECTION(«CMD(«make»)») + +- most often used to build software, useful also in other situations +- Stuart Feldman (1976), POSIX, GNU +- regenerates dependent files with minimal effort +- keeps generated files up-to-date without running your entire workflow +- Makefile abstracts out dependency tracking +- rule, recipe, target, prerequisites + +EXERCISES() + +- Look at this +XREFERENCE(«http://ilm.eb.local/gitweb/?p=user-info;a=blob;f=backup/Makefile;hb=HEAD», +«simple Makefile») which creates the bareos configuration files +from a (public) template file and a (secret) file that contains +passwords. Identify targets, recipes, rules, prerequisites. - +Create a couple of text files with CMD(«head -c 100 /dev/urandom | +base64 -w $(($RANDOM / 1000 + 1)) > $RANDOM.txt»). Write a Makefile +which creates for each CMD(«.txt») file in the current directory a +corresponding CMD(«.wc») file which contains the number of lines +in the text file. Extend the Makefile by a rule which reads the +CMD(«.wc») files to create an additional file CMD(«sum»), which +contains the sum of the line counts (CMD(«cat *.wc | awk "{s += $1} +END {print s}"»)). Draw the dependency graph which is described in +your Makefile. Modify some files (CMD(«echo hello >> $NUMBER.txt»)) +and run CMD(«make») again. Discuss the time to compute the new +sum. Add a CMD(«clean») target which lets you remove all derived +files (CMD(«.wc»), CMD(«sum»)) with CMD(«make clean»). +- There are two flavors of CMD(«make») variables. Read the +XREFERENCE(«http://www.gnu.org/software/make/manual/make.html», +«make documentation») to understand the difference. +- Look at the list of automatic CMD(«make») +variables in section on implicit rules of the +XREFERENCE(«http://www.gnu.org/software/make/manual/make.html», +«make manual»). + +HOMEWORK(« +Explain the difference between simply and recursively expanded make +variables. Discuss under which circumstances either of the two should +be used in a Makefile. +») + +SECTION(«CMD(«autoconf»)») + +- creates shell scripts that configure software packages +- makes life easier for the EMPH(users) of your software +- Mackenzie (1991), GNU, written in perl, uses m4 +- idea: test for features, not for versions + +EXERCISES() + +- Preparation: Assume for simplicity that your software package +consists only of a single script REFERENCE(«s1l», «s1l») which +prints the sha1sum of each file in the given directory. Create this +script in a scratch directory and add the REFERENCE(«configure.ac», +«configure.ac») and REFERENCE(«Makefile.in», «Makefile.in») +files as well. +- Run CMD(«autoconf») to create the configure script, then +run CMD(«./configure -h») to see the automatically generated +options. 
Run CMD(«configure --prefix $HOME»), CMD(«make») and +CMD(«make install») to install the "package". Notice how the value +specified to the CMD(«--prefix») option propagates from the command +line to CMD(«Makefile»). +- Draw a graph which illustrates how the generated files +depend on each other. Compare your graph with diagram on the +XREFERENCE(«https://en.wikipedia.org/wiki/Autoconf», «autoconf +wikipedia page»). In this larger diagram, identify those parts of +the graph which are present in the minimal CMD(«s1l») example. +- Suppose you spent a lot of hard work to improve your program to +use the much more secure sha2 checksum instead of sha1 (CMD(«sed +-i s/sha1/sha2/g *; mv s1l s2l»)). To give your customers the best +possible experience, you'd like the configuration step to fail +on systems where the CMD(«sha2sum») program is not installed +(notice the mantra: check for features, not versions). Add this +REFERENCE(«AC_PATH_PROG», «test») to CMD(«configure.ac») +and run CMD(«autoconf») and CMD(«./configure») again to see it +failing gracefully at configure time (rather than at runtime). +- Discuss the pros and cons of configure-time checks vs. run-time +checks. +- Change CMD(«configure.ac») to not fail any more but to create +a Makefile which installs either s1l or s2l, depending on whether +CMD(«sha2sum») is installed. +- Read the "Making configure Scripts" +and "Writing configure.ac" sections of the +XREFERENCE(«http://www.gnu.org/software/autoconf/manual/autoconf.html», +«autoconf manual»). +- Notice that CMD(«configure.ac») is in +fact written in the m4 macro language. Look at the +XREFERENCE(«http://www.gnu.org/software/m4/manual/m4.html», «m4 +manual») to get an overview. + + +SUPPLEMENTS() + +SUBSECTION(«FastQ File») + +
+  @read1
+  ATGCCAGTACA
+  +
+  DDDDDDDDDDD
+  @read2
+  ATCGTCATGCA
+  +
+  DDDDDDDDDDD
+
+
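+
+A possible starting point for the FastQ to FastA exercise (GNU sed,
+using its first~step address extension): each record consists of
+four lines, the header line gets its "@" replaced by ">", the
+sequence line is printed unchanged, and the other two lines are
+dropped.
+
+	sed -n '1~4s/^@/>/p;2~4p' reads.fq > reads.fa
+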
+ +SUBSECTION(«.screenrc.multi») + +
+	multiuser on
+	umask ?-wx
+	acladd nubi
+
+
+ + +SUBSECTION(«configure.ac») + +
+
+	AC_INIT(«MPI sha1 list», «0.0.0», «me@s1l.org», «s1l», «http://s1l.mpg.de/»)
+	AC_CONFIG_FILES(«Makefile»)
+	AC_OUTPUT
+
+
+ + +SUBSECTION(«Makefile.in») + +
+	all:
+		@echo "run make install to install s1l in @prefix@"
+	install:
+		install -m 755 s1l @prefix@/bin/
+
+
+ +SUBSECTION(«s1l») + +
+
+	#!/bin/sh
+	find "$1" -type f -print0 | xargs -0 sha1sum
+
+
+ +SUBSECTION(«AC_PATH_PROG») + +
+	AC_PATH_PROG(«SHA2SUM», «sha2sum»)
+	test -z "$SHA2SUM" && AC_MSG_ERROR(sha2sum required)
+
diff --git a/Debugging.m4 b/Debugging.m4 new file mode 100644 index 0000000..da805b0 --- /dev/null +++ b/Debugging.m4 @@ -0,0 +1,205 @@ +TITLE(« + + All software sucks, be it open-source of proprietary. The only + question is what can be done with particular instance of suckage, + and that's where having the source matters. -- Al Viro (2004) + +», __file__) + +SECTION(«Introduction») + +

It's safe to bet that every non-trivial program contains bugs. +Bugs in the operating system kernel are often fatal in that they +lead to a system crash which requires a reboot. However, thanks to +the concept of virtual memory, bugs in applications usually affect +neither the operating system nor independent processes which happen +to run at the same time. This makes user space programs easier to +debug than kernel code because the debugging tools will usually run +as a separate process.

+ +

We then look at valgrind and gdb, two popular tools +which help to locate bugs in application software. valgrind +is easy to use but is also limited because it does not alter the +target process. On the other hand, gdb is much more powerful +but also infamous for being hard to learn. The exercises aim to get +the reader started with both tools.

+ +

A couple of exercises on gcc, the GNU C compiler, ask the +reader to incorporate debugging information into an executable and +to see the effect of various diagnostic messages which show up when +valid but dubious code is being encountered. Warning messages can +be classified into one of two categories. First, there are +warnings which are based on static code analysis. These so-called +compile-time warnings are printed by the compiler when the +executable is being created from its source code. The second approach, +called code instrumentation, instructs the compiler to +add sanity checks to the executable, along with additional code that +prints a warning when one of these checks fails. In contrast to the +compile-time warnings, the messages of the second category show up at +run-time, and are generated by the compiled program rather +than by the compiler.
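+
+A minimal illustration of the two categories, using the supplements
+below (the exact messages depend on the compiler version):
+
+	# compile-time warnings: static analysis, reported while the
+	# executable is being built
+	gcc -Wall -Wextra -o strerror strerror.c
+
+	# code instrumentation: -fsanitize=undefined adds run-time
+	# checks, so the message appears when the program runs
+	gcc -g -fsanitize=undefined -o ubsan ubsan.c
+	./ubsan 65536 65536    # signed integer overflow is reported at run time
+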

+ +EXERCISES() + + + +SUPPLEMENTS() + +SUBSECTION(«deref.c») + +
+	#include <stdio.h>
+	#include <string.h>
+	int main(int argc, char **argv)
+	{
+		printf("arg has %zu chars\n", strlen(argv[1]));
+	}
+
+ +SUBSECTION(«deref.sh») + +
+	#!/bin/sh
+	./deref
+
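+
+To reproduce the crash triggered by deref.sh under the debugger, one
+could build deref.c with debugging information and run it under gdb,
+or run the unmodified binary under valgrind (a sketch):
+
+	gcc -g -o deref deref.c
+	# run until the crash, then print a backtrace
+	gdb -batch -ex run -ex backtrace --args ./deref
+	# valgrind reports the invalid read without any special build
+	valgrind ./deref
+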
+ +SUBSECTION(«strerror.c») + +
+	#include "stdio.h"
+	#include "stdlib.h"
+	#include "string.h"
+	#include "assert.h"
+
+	/* print "system error: ", and the error string of a system call number */
+	int main(int argc, char **argv)
+	{
+		unsigned errno, i;
+		char *result = malloc(25); /* 5 * 5 */
+		/* fail early on errors or if no option is given */
+		if (errno && argc == 0)
+			exit(0);
+		errno = atoi(argv[1]);
+		sprintf(result, strerror(errno));
+		printf("system error %d: %s\n", errno, result, argc);
+	}
+
+ +SUBSECTION(«print_arg1.c») + +
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	static int print_it(char *arg)
+	{
+		return printf("arg is %d\n", atoi(arg));
+	}
+
+	int main(int argc, char **argv)
+	{
+		return print_it(argv[1]);
+	}
+
+ +SUBSECTION(«ubsan.c») + +
+	#include <stdio.h>
+	#include <stdlib.h>
+	int main(int argc, char **argv)
+	{
+		int factor1 = atoi(argv[1]), factor2 = atoi(argv[2]),
+			product = factor1 * factor2;
+		return printf("%d * %d = %d\n", factor1, factor2, product);
+	}
+
diff --git a/Filesystems.m4 b/Filesystems.m4 new file mode 100644 index 0000000..860fc62 --- /dev/null +++ b/Filesystems.m4 @@ -0,0 +1,1471 @@ +TITLE(« + + Happy filesystems are all alike, but every corrupted filesystem is + unhappy in its own way. -- Jonathan Corbet (2011) + +», __file__) + +OVERVIEW(« + + The first part of this chapter covers general concepts related to + filesystems. This part is largely independent of the underlying + operating system. The later sections look at some aspects of two + popular local local filesystems for Linux: ext4 and xfs. The last + section contains selected topics related to the network filesystem, + nfs. +») + +SECTION(«Introduction») + +

Every Unix system has at least one filesystem which contains +the root of the tree of files. This filesystem, the root file +system, is normally stored on a local block device, for example on +an SSD. On top of that, several additional filesystems of different +types are mounted usually. For example, most Linux distributions +mount the proc and sys pseudo filesystems and several instances of +tmpfs. Other filesystems may be mounted as needed.

+ +SUBSECTION(«Classification») + +

The Linux kernel supports several dozen different filesystems
+and new ones are added frequently. Some filesystems are only employed
+for special-purpose computers while others are in use on almost all
+systems. The /proc/filesystems pseudo file contains the
+list of supported filesystems. We don't aim to provide a full listing
+but classify filesystems as belonging to exactly one of the following
+categories.

+ +
+
local
+ +
The filesystem is stored on a local block device, and only the + local computer can access it. Examples: ext4, xfs, fat.
+ +
pseudo
+ +
These filesystems are characterized by the absence of + backing storage. That is, there is no block device which stores the + contents. Instead, contents exist only in memory, or are provided on + demand (i.e., when the files are accessed). When the filesystem is + unmounted, all data is lost. Examples: tmpfs, sysfs, proc.
+ +
network
+ +
This type of filesystem makes files which are physically stored + on a different computer visible on the local computer. Examples: nfs, + cifs.
+ +
fuse (filesystem in user space)
+ +
Contents are provided by a user space application. Examples: + sshfs.
+ +
distributed
+ +
The contents are stored on more than one computer. Examples: + glusterfs, lustre, nfs-4.1.
+ +
+ +SUBSECTION(«POSIX Filesystems») + +

Regardless of the category, most native Unix filesystems support
+the semantics prescribed by the POSIX.1-2008 standard. These include
+the open(2), read(2), write(2) and stat(2) system calls, and
+many more. In particular, a POSIX filesystem must store in each
+file's metadata the user and group ID of the owner and the usual
+three timestamps. For compatibility reasons this is not possible for
+"foreign" filesystems like Microsoft's FAT, which was designed for
+the single-user DOS operating system and thus has no concept of file
+ownership.

+ +SUBSECTION(«User, Group, Directory and Project Quotas») + +

Early Unix systems already supported user and group quotas while +directory and project quotas are comparatively new concepts. User and +group quotas impose an upper bound on the files owned by a specific +user or Unix group, respectively. Directory quotas restrict the size +of an accounted directory. Project quotas are similar to directory +quotas but are not restricted to single directories. They are realized +as an aggregation of unrelated inodes with a specific identifier, +the project ID, which is stored in each accounted inode. +It is possible for arbitrary files to have the same project ID and +hence be accounted to the same project. The project ID is independent +from the UID and the GID, hence project accounting is independent of +user and group accounting.

+ +

For each quota type there are two configurable limits: the +inode limit which imposes a bound on the number of files +and directories owned by a specific user/group/project ID, and the +block limit which bounds the space that the same set of files +is permitted to occupy. Each limit is in fact a pair of limits: the +soft limit and the hard limit. When utilization +reaches the soft limit, a message is sent but no further action is +taken. Only if the hard limit is reached, subsequent attempts to +request more resources fail with the EDQUOT (Disk +quota exceeded) error.
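As a hypothetical example (user name, limits and mount point are
+made up, and quotas are assumed to be enabled on the filesystem),
+user limits could be set and inspected as follows. The block limits
+are given in kilobytes, the inode limits as plain numbers.
+
+	setquota -u jdoe 900000 1000000 9000 10000 /home
+	quota -u jdoe        # show usage and limits for one user
+	repquota /home       # report all users of the filesystem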

+ +SECTION(«Filesystem Design») + +In this section we take a closer look at the data structures of local +filesystems, their on-disk layout, and some of the techniques that +make filesystems fast and robust. + +SUBSECTION(«Superblocks») + +

The superblock describes the geometry of the +file system. Only one superblock is needed for normal operation but +most local filesystems store backup copies of the superblock. These +extra superblocks are only accessed during recovery from filesystem +corruption, for example if the main superblock was overwritten by +accident. Although the details vary between filesystem types, the +following information is typically stored in the superblock:

+ + + +

The blkid library, libblkid, contains a database of filesystems and
+of other software which stores its metadata in a specific superblock
+of a block device. This includes filesystems, swap devices, physical
+volumes for software raid or the logical volume manager. The library
+enables applications to identify the contents of a block device
+and to extract additional information like the UUID. There are a
+number of tools which are linked against this library to examine
+or modify the superblocks: lsblk(8) to list block
+devices, blkid(8) to print block device properties, and
+wipefs(8) to overwrite (or merely print) all superblocks
+of a given block device. Also the mount(8) executable is
+usually linked against libblkid to support mounting by UUID instead
+of device name because device names like /dev/sda might
+change across reboots.
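A short illustration of these tools; the device name and the UUID
+are placeholders.
+
+	lsblk -f           # list block devices with filesystem type and UUID
+	blkid /dev/sda1    # print the superblock properties of one device
+	wipefs /dev/sda1   # list (without erasing) the superblock signatures found
+	mount UUID=0123-4567-89ab-cdef /mnt   # mount by UUID instead of device name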

+ +SUBSECTION(«B-trees and Variants») + +

Most filesystems, including the ext4 and xfs filesystems described +in dedicated sections, employ some B-tree variant to manage their +data blocks. This is reason enough to take a closer look at the key +features of this ubiquitous data structure. We won't go into detail, +though.

+ +

B-trees were invented in the 1970s by Rudolf Bayer and Ed McCreight
+as a data structure for an algorithm that can quickly access a random
+block in a particular file stored on a rotating disk. The parameters
+can be tuned to minimize the number of disk operations performed,
+i.e., the height of the tree.

+ +

It is unclear what the "B" in "B-tree" actually means. It certainly +does not mean "binary", because B-trees typically have a high fanout, +so nodes have way more than two children (which, by definition, +is the maximal number of child nodes for a binary tree). The typical +fanout values for filesystems range from 100 to 1000.

+ +

B-trees impose a fixed lower bound on the number of child nodes, and +an upper bound that is twice as large as the lower bound. The upper +bound is called the order of the tree. These bounds imply +an upper bound for the maximal height of the tree, given the number +of leaf nodes. Hence a B-tree is always well balanced, lookup times +are always optimal (i.e., logarithmic), and storage utilization is +at least 50%. Unlike a hash table approach there is no decrease in +performance when utilization approaches 100%.

+ +

Addition and removal of nodes is performed in a way that keeps +the tree balanced. For example, if a node is being removed and this +results in a violation of the lower bound on the number of child nodes +of the parent node, nodes are moved between siblings, or siblings +are merged.

+ +

A node with k children always has k - 1 keys. +For filesystems, the keys can be block numbers, hashes, directory +entries, or the size of block ranges. In any case, the keys act as +separators which divide the child nodes. For example, if the node +contains 3 child nodes and the two keys 23 and 42, then the left child +node contains keys less than 23, the middle child node contains keys +between 23 and 42, and the right child node contains keys greater +than 42.

+ +

Many filesystems, including xfs, do not use the classic B-tree +structure outlined above but its variant called B+tree. The +main differences between a B-tree and a B+tree are (a) data records +are stored only in leaf nodes, and (b) leaves are linked together so +that all data records can be traversed in-order by following sibling +pointers. This little detail has far-reaching consequences. Roughly +speaking, the sibling pointers prohibit copy on write for metadata +blocks.

+ +SUBSECTION(«Journaling») + +

Single filesystem operations often need to update multiple +blocks. For example, adding a new file to a directory requires three +block updates: the data block which contains the new file's contents, +the metadata block which contains the directory entries, and the +on-disk structures that manage the free blocks. Regardless of the +order in which these updates are performed, an unclean shutdown due +to a power outage or a system crash leads to a corrupt filesystem if +the shutdown happens after the first but before the third update was +performed. Journaling is a capability which avoids this situation by +making multiple block updates atomic, thereby ensuring consistency +after an unclean shutdown.

+ +

One of the first journaling filesystems was jfs, introduced in 1990 by
+IBM for the AIX operating system. The first journaling filesystem for
+Linux was reiserfs version 3, which was included in the Linux kernel
+in 2001. In the same year Linux gained support for three additional
+journaling filesystems: jfs and xfs were ported to Linux from AIX
+and IRIX, respectively, and the first stable version of ext3 was
+released.

+ +

Journaling filesystems retain consistency after an unclean shutdown by
+keeping a journal (also known as log) of operations
+being performed. The journal is either stored on a separate device or
+in a reserved area within the filesystem. The entries of the journal,
+the log records, describe transactions, which are
+filesystem operations that must be performed atomically. At the next
+mount after an unclean shutdown, the journal is replayed,
+that is, the recorded transactions are reapplied. The time required
+to replay the journal depends only on the size of the journal and
+the number of log records, but not on the size of the filesystem
+or the number of files. It usually takes only a couple of seconds,
+which is considerably faster than a filesystem check/repair run, which
+can take hours.

+ +

Although a journaling filesystem writes metadata twice, this can +actually increase performance because metadata writes to the +journal are sequential, and when the log entries are committed, writes +can be combined and reordered, which is a win not only for rotating +disks. Since data integrity is usually less important than filesystem +integrity, only metadata (inodes, directory contents) is journaled +by default while data blocks (file contents) are written directly.

+ +SUBSECTION(«Delayed Allocation») + +

This term refers to the technique of deferring the decision of +which blocks to allocate until the last possible moment, when blocks +have to be written out due to memory pressure or an explicit sync +request from user space. This technique is employed by xfs and ext4 +but not by earlier versions of the ext* family.

+ +

Delayed allocation improves the performance of the filesystem because +the allocator has better knowledge of the eventual file size, so +files are more likely to be laid out in an optimal way: data blocks +sequentially, and close to the metadata blocks that describe the +file. Moreover, small temporary files can be fully buffered in memory +and don't cause any block allocation at all if the in-memory data +structures have already been removed when writeout triggers. Delayed +allocation is more effective on large memory systems because with +plenty of memory available, allocations can be deferred for longer.

+ +EXERCISES() + + + +HOMEWORK(« + +Describe the various file locking types mandated by POSIX-2008.1. + +») + +HOMEWORK(« + + + +») + +HOMEWORK(« + +Delayed logging is a feature of ext3 which was later ported +to xfs. The term refers to the concept of accumulating changes in +memory before writing them to the journal. Discuss the pros and cons +of delayed logging. + +») + +HOMEWORK(« + +Explain what reflink copies are and discuss when and how to +use them. + +») + +SECTION(«Alternatives to Journaling») + +

Although our focus lies in topics related to journaling
+filesystems, we shall have a brief look at two different (but
+related) approaches which also guarantee consistency after a crash.
+Both approaches depart from the traditional way of updating data
+and metadata structures. To illustrate the difference, consider
+the situation where a line is appended to an existing text file
+on a traditional filesystem. The filesystem first updates the data
+blocks which contain the file contents. Since the file size and the
+modification time have changed, the inode of the file needs to be
+updated as well. Finally, the on-disk data structures which keep track
+of the used and unused blocks might also need to be updated if a new
+block had to be allocated for the additional line. All three updates
+are usually performed in-place by overwriting the existing blocks.
+The filesystems described in this section are different in that they
+avoid such in-place changes.

+ +SUBSECTION(«Log-structured Filesystems») + +

A log-structured filesystem only writes sequential log entries which +describe the changes that have been made, essentially treating the +entire space as the journal. The first log-structured filesystem was +developed in the 1990s. Popular log-structured filesystems in use +today are logfs, ubifs (the unsorted block image filesystem), +and f2fs (the flash-friendly filesystem).

+ +

The layout of the data and metadata blocks of a log-structured
+filesystem has advantages and disadvantages. One advantage is that
+writes are always sequential, which is particularly good for rotating
+disks. However, reading a large file sequentially becomes slower
+because of data fragmentation. The purely sequential writes also reduce
+wear on the storage media, particularly flash memory, because
+without in-place writes, all blocks are written about the same number
+of times. Other advantages of log-structured filesystems are that crash
+recovery is conceptually easy and that snapshots are natural.

+ +

The main disadvantage is the overhead incurred by garbage +collection: getting rid of old log entries that have been +superseded by later ones. This overhead is particularly large if free +space is short. Another disadvantage is related to inode management: +since inodes are scattered throughout the disk, and the location of an +inode changes whenever the file is updated, some kind of inode +map is required to locate inodes. Updates to the inode map +can be written to the log sequentially, but the current location +of the inode map must be stored in a special fixed location called +the checkpoint region so that the filesystem can recover +from a crash. Since every write to the checkpoint region causes +a seek, log-structured filesystems update the inode map only once +in a while, for example once per minute. This process is known as +checkpointing.

+ +SUBSECTION(«Copy on Write Filesystems») + +

In the context of filesystems, the term copy on write
+(CoW) means to not overwrite existing data or metadata blocks as
+files are modified. Instead, whenever the filesystem needs to modify
+a data or metadata block, it writes the modified block to a different,
+currently unused location, leaving the contents of the original
+block intact. Next, the metadata blocks that need to be altered
+are also written to a free location without overwriting existing
+metadata blocks. For example, in the scenario outlined above where the
+write(2) system call extends an existing file, the three
+updates for data, inode and free space information are performed as
+writes to unused locations. Finally, the filesystem switches to the new
+metadata to commit the change. CoW filesystems are always consistent
+if this last step can be performed atomically.

+ +

Two well-known open source CoW filesystems are zfs and btrfs
+(the B-tree filesystem). The first stable release of zfs for
+the Solaris operating system appeared in 2005, after four years of
+development by Sun Microsystems. Since then zfs has been ported to
+several other operating systems including FreeBSD and Linux. However,
+the zfs code is licensed under the CDDL license, which is regarded
+as incompatible with the GPL. For this reason, zfs is not included
+in the official Linux operating system kernel but is available as
+a third-party kernel module. Btrfs is another CoW filesystem which
+was designed for Linux from the start. It was merged into the Linux
+kernel in 2009. The feature sets of zfs and btrfs are similar, but
+the implementation details vary.

+ +

CoW filesystems also have disadvantages. On a system where multiple +processes append data to different files simultaneously, the data +blocks of each file will be fragmented, which is bad for performance. +For the same reason, metadata blocks are fragmented as well, so +performance suffers if the filesystem contains many files. Another +disadvantage is related to the fact that it is difficult to tell +how much space is going to be needed for a given CoW operation, +which has caused an endless list of bugs that occur when disk space +gets tight. This is why CoW filesystems should never use more than a +certain ratio of the available space. For zfs the recommended limit +is as low as 70%.

+ +EXERCISES() + + + +SECTION(«Encryption») + +

The dm-crypt device mapper target, which was covered in the chapter on LVM, operates at the block level. +It encrypts and decrypts one block at a time, regardless of whether +the block is in use. This is in contrast to filesystem-level +encryption, where encryption is performed by the filesystem +on a per inode basis so that, for example, different files can +be encrypted with different keys.

+ +

In this section we look at two filesystem-level encryption
+primitives for Linux: ecryptfs (the enterprise cryptographic
+filesystem) and fscrypt (filesystem encryption). The
+former was included in Linux in 2006 while the latter is much
+newer. It was originally part of the flash-friendly filesystem
+(f2fs) but was made generic in 2015. Besides f2fs, ext4
+and ubifs also rely on fscrypt for encryption.

+ +

By definition, filesystem-level encryption means to encrypt the +contents of regular files. In addition to that, both ecryptfs and +fscrypt also encrypt file names. However, information stored in the +inode is left unencrypted. Therefore, without the encryption key it is +still possible to list directories as usual, but the list will contain +only encrypted filenames. Moreover, file size and timestamps can be +read as for unencrypted files with the standard stat(2) +system call.

+ +SUBSECTION(«ecryptfs») + +

ecryptfs is a so-called stacked filesystem. That is,
+it relies on an (arbitrary) mounted filesystem as backend storage. An
+ecryptfs mount is similar to a bind mount in that it makes the files
+stored at the source location (the mountpoint of the backend storage)
+visible at the target location. However, the source usually contains
+only encrypted files while files appear unencrypted at the target
+location. Each encrypted file is self-contained in the sense that
+it starts with a header which, together with the encryption key, is
+sufficient to decrypt the file (and the file name). Hence encrypted
+files can be copied between hosts, and encrypted files can be backed
+up without telling the backup software the encryption key.

+ +SUBSECTION(«fscrypt») + +

fscrypt takes a different approach. It provides encryption
+through a general library that can, in principle, be used by any
+filesystem. With fscrypt it is possible to store both encrypted and
+unencrypted files on the same filesystem. fscrypt has a lower memory
+footprint than ecryptfs since it avoids caching filesystem contents
+twice. Also, only half as many directory entries and inodes are
+needed. Another advantage of fscrypt is that the fscrypt API can be
+used by unprivileged users, with no need to mount a second filesystem.
+The major drawback of fscrypt is that the open(2) system call
+fails without the key. Since backup software has to open regular files,
+it is not possible to back up encrypted files without the encryption
+key.

+ +EXERCISES() + + +HOMEWORK(« + +Discuss the pros and cons of filesystem level encryption vs. +block level encryption. + +») + +SECTION(«The Virtual Filesystem Switch (vfs)») + +

The main task of the vfs is to provide an abstraction for +applications to access files in a uniform way, even if the files +are stored on different filesystems. The vfs is responsible for +parsing path names received from user space via system calls, and +to forward the requests it can not handle itself to the specific +filesystem implementation, thereby associating paths with instances +of mounted filesystems. This encourages a modular filesystem design +where filesystems are opaque entities which provide a certain set of +methods called filesystem operations, for example mounting +the filesystem or opening a file. The modular design helps to avoid +code duplication, which is important for operating systems like Linux +which support many different filesystems.

+ +

The first vfs implementation was probably shipped in Sun's Unix
+system (SunOS) in 1985. Linux got its first vfs implementation together
+with the extended filesystem, the predecessor of ext2,
+ext3 and ext4. All modern operating systems have some sort of vfs,
+although implementation details differ. In what follows, we shall
+only discuss the Linux vfs.

+ +

Filesystems register themselves with the vfs at boot time or +when the kernel module for the filesystem is loaded. The vfs keeps +track of the available filesystem types and all mounts. To perform +efficiently, the vfs maintains several data structures which describe +the characteristics of the tree of files. We look at the most important +data structures below but leave out the rather complicated details +about how the various locking primitives (spinlocks, refcounts, RCU) +are employed to deal with concurrency.

+ +SUBSECTION(«The Dentry Cache») + +

A dentry (short for "directory entry") is a data structure +which represents a file or a directory. Dentries contain pointers to +the corresponding inode and to the parent dentry. The vfs maintains +the dentry cache, which is independent of the normal page +cache that keeps copies of file contents in memory. Dentries are kept +in hashed lists to make directory lookups fast.

+ +

On a busy system the dentry cache changes frequently. For example, +file creation, removal and rename all trigger an update of the dentry +cache. Moreover, memory pressure can cause dentries to be evicted from +the cache at any time. Clearly, some sort of coordination is needed to +keep the dentry cache consistent in view of concurrent changes, like +a file being deleted on one CPU and looked up on another. A global +lock would scale very poorly, so a more sophisticated method called +RCU-walk is employed. With RCU, lookups can be performed +without taking locks, and read operations can proceed in parallel +with concurrent writers.

+ +

The dentry cache also contains negative entries +which represent nonexistent paths which were recently looked up +unsuccessfully. When a user space program tries to access such a path +again, the ENOENT error can be returned without involving +the filesystem. Since lookups of nonexistent files happen frequently, +failing such lookups quickly enhances performance. Naturally, negative +dentries do not point to any inode.

+ +

Dentries are reference-counted. As long as there is a reference +on a dentry, it can not be pruned from the dentry cache.

+ +SUBSECTION(«File and Inode Objects») + +

Positive entries in the dentry cache point to inode +objects, which are in-memory copies of the on-disk inode +structures maintained by the filesystem. Different dentry cache entries +can map to the same inode object if the underlying filesystem supports +hard links, but entries which refer to directories are unique. The +stat(2) system call can be served without calling into +the filesystem if the path argument of the system call corresponds +to an entry of the dentry cache.

+ +

When a file is opened, the vfs allocates a file object +(also known as file description and struct file), +and adds a reference to the file object to the calling process' table +of open files. The index to this table is returned as the file +descriptor from the open(2) system call. The file +object contains a reference to the dentry and to the filesystem +specific methods for the usual operations like read(2) +and write(2). Also the file offset and the file status +flags (O_NONBLOCK, O_SYNC, etc.) are recorded in the +file object.

+ +

Like dentries, file objects are reference-counted. Once the +counter hits zero, the file object is freed. System calls like +dup(2) and fork(2) increase the reference +count while close(2) and exit(2) decrease +it. However, not all file object references correspond to file +descriptors, since also the kernel itself can hold references to a +file object. For example, the loop device driver and the ecryptfs +stacked filesystem increase the reference counter of the file objects +they work with. Passing a file descriptor from one process to another +via local sockets is another situation where the reference counters +of the affected file object need to be adjusted.

+ +

When an application calls dup(2) to duplicate a +file descriptor, the two copies refer to the same file object and +therefore share the file offset and the status flags of the file +object. An open(2) call, however, creates a new file +object even if the file is already open.

+ +SUBSECTION(«vfs Mounts and vfs Superblocks») + +

Another job of the vfs is to keep track of the tree of all mounts
+and their properties. To do so, the vfs maintains a tree of mount
+structures. Whenever a filesystem is mounted, one such structure
+is allocated and linked into the tree. Among other information,
+the mount structure contains various pointers, including pointers to

+ + + +Other information stored in the mount structure: + + + +

Since a filesystem can be bind-mounted, there can be several mount +structures whose superblock pointers point to the same superblock. +The superblock structure contains the UUID, quota information, +granularity of timestamps, and much more.
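The effect can be observed with a bind mount; the directories below
+are arbitrary examples.
+
+	mount --bind /srv/data /mnt/alias   # second mount structure, same superblock
+	findmnt /mnt/alias                  # shows the bind mount and its source
+	umount /mnt/alias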

+ +EXERCISES() + + + +HOMEWORK(« + +Describe the concept of a file change notification system and discuss +possible use cases. The Linux kernel contains three different file +change notification APIs: dnotify, inotify and fsnotify. Explain the +difference and describe in one paragraph how applications make use +of certain file descriptors to communicate with the kernel in order +to track file change events. + +») + +SECTION(«ext, ext2, ext3, ext4») + +

When the first versions of Linux were released in 1991, the kernel +relied on the minix filesystem whose source code could +be used freely in education. However, this filesystem was designed +for education rather than for real use and had severe limitations. +For example it supported only file names up to 14 characters long, +and a total size of 64M.

+ +

In 1992 the extended filesystem (ext) was released as the +first filesystem that was created specifically for Linux. It already +made use of the VFS API that was introduced at the same time. In 1993, +ext was superseded by ext2, and later by ext3 (2001) and ext4 (2008). +While ext2 is a separate filesystem, the ext3 and ext4 filesystems +share the same implementation.

+ +

Over time, many new features were added to ext3 and later to ext4. +For example, journaling was one of the main features that ext3 added +on top of ext2 while delayed allocation and on-disk data structure +checksumming came with ext4. In the remainder of this section we look +at the way the ext* filesystems lay out their data and metadata blocks +and describe some of the features of ext3 and ext4.

+ +SUBSECTION(«Block Layout») + +

All ext* filesystems lay out the space of the underlying block +device in a traditional way, inspired by the original Unix +filesystem, ufs. The available blocks are partitioned into block +groups. Each block group is typically 128M large and always +contains its own set of tables and bitmaps that manage the blocks of +the group. If feasible, the data blocks of each file are kept in the +same block group as its inode and the containing directory to avoid +unnecessary seeks. As of ext4, block groups can be combined to form +larger, so-called flexible block groups to improve metadata +locality and to have large files laid out sequentially.

+ +

Inodes are referenced in the inode table, and a bitmap keeps +track of allocated and unallocated inodes. A copy of the filesystem +superblock is stored in several other block groups since the superblock +is critical for the integrity of the filesystem.
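On an existing ext4 filesystem the block group layout and the backup
+superblocks can be examined with dumpe2fs(8); the device name below
+is a placeholder.
+
+	dumpe2fs -h /dev/sda2    # print only the superblock summary
+	dumpe2fs /dev/sda2       # also list the block groups, including backup superblocks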

+ +

The directory entries of an ext or ext2 filesystem are laid out
+in the traditional way. That is, the names of the entries and associated
+information like file type and the inode number are listed in the data
+blocks that correspond to the directory. Directories can be imagined
+as a 3-column table where each line contains an inode number, a file
+type number, and the path component. Since searching a linear array
+performs poorly, ext3 implemented B-trees to speed up name lookups
+in large directories. The nodes of the tree are keyed by the hashes
+of the names of the directory entries.

+ +SUBSECTION(«Journaling Modes») + +

If journaling is enabled, metadata updates are always journaled +(i.e., a log record is written to the journal first). However, this +is not always the case for data blocks. The data mount +option for ext3 and ext4 specifies how writeout of data blocks works. +The following three journaling modes offer different trade-offs +between speed and data integrity.

+ +
+
data=journal
+ +
This is the slowest, but also the safest journaling mode. In + this mode all data blocks are journaled just like the metadata + blocks. Since all data blocks are written twice, this journaling mode + has a substantial negative impact on performance.
+ +
data=ordered
+ +
This is the default value, which offers a good trade-off between + speed and data integrity. Ordered means that metadata blocks are only + updated after the corresponding data blocks have been written out. In + other words, data is written directly to the filesystem (rather than + the journal as with data=journal), but the update of the + corresponding metadata blocks is deferred until all data blocks have + been written.
+ +
data=writeback
+ +
This is the fastest mode of operation. No ordering between data + and metadata blocks is enforced. Filesystem integrity is guaranteed + because metadata is still journaled. However, after an unclean + shutdown and the subsequent replay of the journal, files which were + under writeback at the time of the crash may contain stale data. This + happens if the metadata blocks have been updated to report the larger + size but the corresponding data did not make it to the disk before + the crash.
+
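The journaling mode is selected per mount. A brief sketch, with
+device and mount point chosen arbitrarily:
+
+	# pick one of the three modes at mount time
+	mount -o data=journal /dev/sdb1 /mnt      # safest, slowest
+	mount -o data=ordered /dev/sdb1 /mnt      # the default
+	mount -o data=writeback /dev/sdb1 /mnt    # fastest, weakest guarantees
+
+	# optionally record a non-default mode as a default mount option
+	tune2fs -o journal_data_writeback /dev/sdb1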
+ +SUBSECTION(«Extents») + +

The ext2 and ext3 filesystems employ the traditional indirect block
+scheme, which is basically a table of block numbers which map to the
+blocks that comprise the contents of the file. One feature of ext4
+is extent trees, which are a more efficient data structure
+to describe this mapping because file contents are often laid out
+sequentially on the underlying block device. In this case, if the file
+is large it saves metadata space if the block numbers are not stored
+as a long list but as extents, that is, ranges of successive
+blocks. Extents have to be organized in a tree to quickly map a given
+file offset to the block number that contains the file contents at this
+offset. The first few extents of the extent tree, including its root,
+are stored in the inode itself. Only files which need more extents
+require extra metadata blocks for the nodes of the extent tree.

+ +SUBSECTION(«Growing and Shrinking») + +

Filesystems often need to be grown, but sometimes it is also handy +to shrink a filesystem, for example to redistribute the storage between +the logical volumes of a volume group of fixed size. A feature which +distinguishes ext* from xfs is that ext2, ext3 and ext4 filesystems can +be shrunk while xfs filesystems can only be grown. However, to shrink +an ext* filesystem, the filesystem must be offline. That's in contrast +to online growing, which is supported for both ext* and xfs.

+ +

To shrink a filesystem which is stored on a logical volume,
+one needs to convey the new size to both the device mapper and the
+filesystem. It is usually a good idea to run fsck(8) after
+the filesystem has been unmounted and before resize2fs
+is run to shrink it. After the filesystem has been shrunk, the next
+step is to also shrink the underlying LV. Of course the new size of
+the LV must not be smaller than the size of the shrunk filesystem. To
+prevent this from happening, for example due to rounding errors,
+it's best to make the LV slightly larger than necessary and then
+enlarge the filesystem to the maximal possible size by running
+resize2fs(8) without specifying the new size.
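A sketch of the full procedure; the volume name and the sizes are
+hypothetical, and a current backup is assumed.
+
+	umount /mnt/data
+	e2fsck -f /dev/vg0/data         # check the filesystem before resizing
+	resize2fs /dev/vg0/data 90G     # shrink the filesystem first
+	lvreduce -L 95G /dev/vg0/data   # then shrink the LV, leaving some slack
+	resize2fs /dev/vg0/data         # grow the filesystem to fill the LV
+	mount /dev/vg0/data /mnt/data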

+ +EXERCISES() + + + +HOMEWORK(« + +») + +SECTION(«The Extents Filesystem (xfs)») + +xfs is a journaling filesystem which was implemented in 1993 for the +IRIX operating system and was ported to Linux in 2001. While IRIX was +discontinued in 2005, the Linux port of xfs is actively maintained +and new features and improvements are added regularly. + +To optimize for performance, xfs departs from the traditional approach +that is followed by the ext* family. From the beginning xfs was +designed for running on high-end servers where plenty of resources +are available to max out even the largest and fastest storage systems, +and to perform well under high load when multiple threads access +the filesystem concurrently. The implementation relies on B-trees for +data, metadata and free space management. xfs is not a COW filesystem, +though, and does not do any form of block device management. Tasks like +encryption, raid or volume management are left to the corresponding +filesystem-agnostic block layer interfaces, for example MD (Software +Raid) and LVM (the logical volume manager). + +Unlike the rather static layout of the ext* filesystems, metadata on +xfs is dynamically allocated. Consequently, the metadata structures +have to be discovered at mount time by walking the filesystem +structure. Metadata blocks are self-describing in that each metadata +object contains a unique identifier, the log sequence number, +which plays the role of a timestamp. CRC32c checksums are used to +detect corruption: when the block is read, the CRC32c value is +recomputed and checked to verify to integrity of the object. + +SUBSECTION(«Allocation groups») + +Like the block groups of the ext* family, an xfs filesystem is +divided into several "mini filesystems" called Allocation +Groups (AGs). This allows xfs to handle operations in parallel, +which improves performance if many unrelated processes access the +filesystem simultaneously. New directories, are always placed in a +different AG than its parent and the inodes of the files in the new +directory are clustered around the directory if possible. + +AGs can be up to 1T large, which is much larger than the block groups +of the ext* family, but still small enough to allow relative AG +pointers to be 32 bits wide rather than 64. Each AG maintains its own +superblock and its own set of B-trees for resource management. Files +and directories are allowed to span multiple AGs, so the AG size does +not limit the maximal file size. + +The first AG is the primary AG. Its superblock is special +in that it stores the accumulated counters of all AGs. The secondary +superblocks are only consulted by xfs_repair(8). + +SUBSECTION(«Project Quota Implementation») + +

Project quotas used to be an xfs feature, but the functionality has +been made generic and is therefore available to other filesystems as +well. Besides xfs, also ext4 supports project quotas.

+ +

To limit the size of an arbitrary subtree, a special inode flag,
+XFS_DIFLAG_PROJINHERIT, is used. This flag indicates
+that the directory and all inodes created in the directory inherit
+the project ID of the directory. Hence the act of creating a file
+in an XFS_DIFLAG_PROJINHERIT marked directory associates
+the new file with a specific project ID. New directories also get
+marked with XFS_DIFLAG_PROJINHERIT so the behaviour is
+propagated down the directory tree.

+ +

Project quota is accounted for when moving into an accounted +directory tree, but not when moving out of a directory tree into +an unaccounted location. Moreover, one can create hard links to an +accounted file in an uncontrolled destination (as the inode is still +accounted). But it is not allowed to link from an accounted directory +into a destination with a different project ID.

+ +

Project IDs may be mapped to names through the +/etc/projid and /etc/projects configuration +files.
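A hypothetical setup for a project named webdata with ID 42,
+accounted under /srv/www on an xfs filesystem mounted at /srv with
+the prjquota option:
+
+	echo '42:/srv/www' >> /etc/projects    # map the project ID to a directory
+	echo 'webdata:42' >> /etc/projid       # map the project name to the ID
+	xfs_quota -x -c 'project -s webdata' /srv
+	xfs_quota -x -c 'limit -p bhard=10g webdata' /srv
+	xfs_quota -x -c 'report -p' /srv       # show project usage and limits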

+ +SUBSECTION(«Speculative Preallocation») + +

As files are being written, xfs allocates extra blocks beyond the +current end of file, anticipating that further writes will arrive to +extend the file. The preallocation size is dynamic and depends mainly +on the size of the file. When the file is closed, the unneeded extra +blocks are reclaimed.

+ +

The speculatively preallocated post-EOF blocks help to minimize +file fragmentation, but they can cause confusion because they are +accounted identically to other blocks, making files appear to use +more data blocks than expected.

+ +SUBSECTION(«Reverse Mapping») + +

This feature was implemented in 2018. It adds yet another B-tree to +the xfs on-disk data structures: the reverse mapping tree, +which allows the filesystem to look up the owner of a given block +(if any). For example, if the underlying storage device reports that +a certain block went bad, and that block happens to contain contents +of a regular file, the reverse mapping tree yields the corresponding +inode and the file offset.

+ +

Another use of the reverse mapping tree is filesystem +scrubbing, a data integrity technique where a kernel thread +runs in the background to check the on-disk data structures for +consistency while the filesystem is mounted.

+ +

Since reverse mapping imposes some performance overhead, the +feature is disabled by default.

+ +SUBSECTION(«mkfs Options») + +Generally, the default settings are suitable for most workloads, +so there is usually no need for manual optimization. Nevertheless, +many xfs parameters can be tweaked at filesystem creation time. The +following list describes some options to mkfs(8). + + + +EXERCISES() + + + +HOMEWORK(« + +Summarize how the reflink feature is implemented in xfs. + +») + +HOMEWORK(« + +Explain how xfs metadumps work and which parts of the filesystem are +included in the dump. Provide a formula to estimate the expected +size of the dump, given the outputs of xfs_info(8) +and df(1). + +») + +SECTION(«The Network Filesystem (nfs)») + +The nfs service allows computers to mount a directory located on +a remote server as if it were a local disk, allowing file sharing +over a (typically local) network. On Linux, both the server and the +client are part of the operating system kernel, but there are also +nfs implementations which operate in user space. The nfs protocol +is an open specification, available as a set of RFCs, that has been +implemented on various operating systems including Linux, FreeBSD +and MacOS. Server and client can run different operating systems as +long as they both support a common nfs protocol version. + +The original nfs protocol was designed in 1984 by Sun Microsystems for +the Sun operating system (SunOS). This version was never released and +was only deployed inside the company. Protocol version 2 was released +in 1989. It is long obsolete due to its severe limitations, for example +it had a maximal file size of 2G. Its successor, protocol version 3, +was released in 1995 and fixed most of the limitations. This version +is still in use, although nfs protocol version 4 (called nfs4 in what +follows) is most frequently deployed these days. It was released +in 2000 and contains several performance, robustness and security +improvements over the older versions. The authorative resource for +the gory details of nfs4 is RFC 7530. The nfs protocol is still under +active development. Protocol version 4.1 was released in 2010 and +version 4.2 followed in 2016. + +SUBSECTION(«rpc and xdr») + +

The nfs protocols are built on top of a concept called remote +procedure call (rpc), which is based on an encoding format known +as external data representation (xdr). The rpcs which are +provided by the nfs server are closely related to filesystem-specific +system calls like read(2), write(2), link(2), rename(2), +mkdir(2) etc. Therefore an introduction to nfs naturally starts +with rpc and xdr.

+ +

The functionality of a network service can often be divided into +the low-level part and the application-level part. The low-level +part talks to the kernel to establish the connection and to send +and receive data, using system calls like socket(2), bind(2), +listen(2), connect(2), recv(2), send(2), etc. This part is +independent of the application layer which is only concerned with +the network protocol of the service. For a service like nfs which +combines more than one network protocol, it makes sense to abstract +out the common low-level part. The rpc framework was designed in 1976 +to provide such an abstraction. It supports a variety of transports +including tcp and udp. With rpc, a program running on one computer can +execute a function on a different computer. The functions that can +be called in this manner, the rpc services, are identified +by a program number and the version number. Originally developed by +Sun Microsystems in the 1980s, rpc is still in use today, sometimes +still under the old "sunrpc" name.

+ +

In general, the called procedure runs on a different system than the
+calling procedure, so the client and server processes don't share the
+same address space, and no memory references can be passed. Instead,
+data structures must be serialized first, i.e. converted to a
+certain transfer format that can be stored in a single memory buffer,
+the xdr buffer, which is then sent to the server. The received xdr
+buffer is de-serialized (decoded) by the server, possibly
+into a different internal representation. For example, the server might store the bytes
+which describe an integer value in a different order than the client
+to meet the requirements of its CPU (little/big endian). The xdr API
+offers routines to convert many predefined data types (int, string,
+etc.) to an xdr buffer or vice versa. This unburdens the programmer
+from such details as much as possible.

+ +

To activate rpc on a system, the rpcbind(8) daemon +(formerly known as portmapper) must be running. This daemon manages +the various procedures employed by nfs such as mount, lock manager, +quota daemon, and the nfs procedure itself. It communicates with +rpc clients by listening on a well-known port. Clients first send a +get_port request to rpcbind(8) in order to +find out the port number which corresponds to the procedure they are +interested in. For example, an nfs client which intends to mount an +nfs-exported directory requests the port number of the mount procedure +from rpcbind(8). A second request is then made to actually +mount the filesystem. The exercises of this section ask the reader to +run the rpcinfo(8) tool to show the available procedures +and their port numbers on the specified server.
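For instance (the server name is a placeholder):
+
+	rpcinfo -p nfs-server.example.org         # list registered rpc programs and ports
+	rpcinfo -t nfs-server.example.org nfs 3   # check that nfs version 3 answers over tcp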

+ +

The input format for rpc is the rpc language (rpcl),
+which is similar to C. This format fully describes the application
+protocol, including all procedures and data types. RFC 7531 contains
+the full xdr description of nfs4 in rpcl. The rpcgen(1)
+protocol compiler generates C code from rpcl input. The C code is
+then compiled to generate application code which implements the
+protocol described by the input. rpcgen(1) offers
+multiple application interfaces which provide different degrees of
+control over the rpc internals.

+ +SUBSECTION(«Stateless and Stateful Protocols») + +

The nfs protocol versions 2 and 3 are stateless, which
+means that, by design, the server does not keep track of what
+clients do. For example, the server does not remember which files
+are currently open. Instead, the client tracks open files and the
+current offset of each open file, translating application requests
+into suitable protocol messages. While statelessness simplifies crash
+recovery for both the client and the server, it also has downsides. For
+example, file locking requires the server to maintain the existing
+locks and the list of clients which are holding them. Since this
+can not be done with a stateless protocol, another rpc service,
+the lock daemon (lockd), was added. To recover the state of
+locks after a server reboot, yet another rpc service, the status
+daemon (statd), had to be introduced. This design added complexity
+for no real gain, which is why nfs4 departed from the previous versions
+by introducing state. With a stateful protocol it became possible to
+combine all related rpc services into a single service which uses a
+single TCP port. This simplifies the implementation and also allows
+for compound operations where the client sends more than
+one request in a single rpc call.

+ +SUBSECTION(«Identifiers and File Handles») + +

File handles describe the file or directory a particular operation +is going to operate upon. For the nfs clients, file handles are +opaque blobs that can only be tested for equality, but which can not +be interpreted in any way. However, for the nfs server a file handle +identifies the corresponding file or directory. Most protocol requests +include a file handle. For example, the LOOKUP and MKDIR operations +both return a file handle to the nfs client.

+ +

A file handle consists of three identifiers: a filesystem ID, +an inode number and the so-called generation number. The +filesystem ID and the inode number also exist for local files. They +are derived from the statvfs structure that describes +the exported filesystem and the stat structure of the +inode, respectively. The generation number, however, is only needed +for network file systems. Roughly speaking, the generation number +counts how many times the inode has been re-used. This is necessary to +prevent clients from accessing a file through an existing file handle +if the file was deleted on the server and its inode number has been +re-used subsequently. File handles are based on leases: +The client periodically talks to the server to update its leases.

+ +

There is a deep interaction between file handles and the dentry
+cache of the vfs. Without nfs, a filesystem can rely on the following
+"closure" property: For any positive dentry, all its parent directories
+are also positive dentries. This is no longer true if a filesystem
+is exported. Therefore the filesystem maps any file handles sent to
+nfs clients to disconnected dentries. Any process whose cwd
+is on a local fs contributes to the reference counter of the dentry
+that corresponds to the directory, and thus prevents the filesystem
+from being unmounted. For nfs, this is not possible. More generally:
+remote applications need a way to refer to a particular dentry,
+stable across renames, truncates, and server reboots.

+ +SUBSECTION(«Attribute Caching») + +

Several rpcs return file attributes, i.e., the inode +information which is available for local filesystems through the +stat(2) system call. For example, the LOOKUP +rpc returns a new file handle and the attributes that are associated +with the given path, and the GETATTR rpc returns the +current attributes of the file which corresponds to an existing +file handle. By default, nfs clients cache these metadata. However, +since metadata can change at any time due to file operations from other +clients, the cached information can become stale. Therefore attributes +are cached for only a few seconds, which is thus the duration of the +time window during which metadata modifications caused on a different +nfs client remain undetected. Reducing this time window can result in +flooding the server with GETATTR requests while extending +it increases the chance of returning stale cached data or metadata +to the application. With the noac mount option, the +client asks the server every time it needs to assess file metadata. +However, the option also prohibits data caching, just like +the sync option. This severely impacts performance.
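The attribute cache behaviour is controlled by mount options; server
+and export below are placeholders.
+
+	# shorten the attribute cache timeout to one second
+	mount -t nfs -o actimeo=1 server:/export /mnt
+
+	# disable attribute caching (and data caching) entirely
+	mount -t nfs -o noac server:/export /mnt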

+ +

Changes to directories are handled similarly. To detect when +directory entries have been added or removed on the server, the +client watches the directory mtime (nfsv2 and nfsv3) or change +attribute (nfsv4). When the client detects a change, it drops +all cached attributes for that directory. Since the directory's mtime +and the change attributes are cached attributes, it may take some +time before a client notices directory changes.

+ +SUBSECTION(«Data Caching and Cache Consistency») + +

nfs clients are usually allowed to cache write operations +because the write caches increase client performance significantly +and reduce the load of the server at the same time, allowing it to +support more clients. However, one side effect of write caching is +that other clients which access the same file at the same time will +not see the changes immediately. The consistency guarantees +of a network file system describe the semantics of such concurrent +file operations.

+ +define(«caco_height», «300») +define(«caco_width», «100») +define(«caco_margin», «10») +dnl: args: y-pos, client-no, text +define(«caco_text», « + $3 +») +
+ + + caco_text(«0», «1», «open») + caco_text(«1», «1», «write») + caco_text(«2», «2», «open») + caco_text(«3», «1», «close») + caco_text(«4», «3», «open») + caco_text(«5», «2», «read») + caco_text(«6», «3», «read») + +
+ +

The nfs versions 2 and 3 provide weak cache consistency +which notifies clients about changes made by other clients before +and after an rpc. This concept turned out to be problematic, so +nfsv4 replaced weak cache consistency by close-to-open cache +consistency, which means that an nfs client is only guaranteed +to see the effects of another client's write operation if it opens +the file after the client that wrote to the file has closed +it.

+ +

To illustrate close-to-open cache consistency, consider the +scenario illustrated in the diagram on the left where three nfs clients +(as indicated by colors) access the same file. The blue client opens +the file and writes to it while the other two clients only perform read +operations. With close-to-open cache consistency the green client is +guaranteed to see the write operation of the blue client while there +is no such guarantee for the red client.

+ +SUBSECTION(«Delegations») + +nfs4 introduced a feature called file delegation. A file +delegation allows the client to treat a file temporarily as if no +other client is accessing it. Once a file has been delegated to a +client, the client might cache all write operations for this file, +and only contact the server when memory pressure forces the client +to free memory by writing back file contents. The server notifies +the client if another client attempts to access that file. + +SUBSECTION(«Silly Renames and Stale File Handles») + +

Many applications employ the following old trick to store temporary +data without leaving a stale temp file behind in case the process +crashes or is killed with SIGKILL. They create and open +a temporary file, then call unlink(2) to disassociate +the path from the filesystem tree while retaining the file descriptor +for subsequent I/O operations.

+ +

With NFS this does not work because the file descriptor exists
+only on the client, and the server doesn't know about it. Consequently
+the normal unlink(2) call on the server would delete
+the file and free its data blocks. This is why the nfs client just
+renames the file to something like .nfs12345
+if an application calls unlink(2) to remove it while it
+is still open. Only after the last file descriptor that refers
+to the silly-renamed file has been closed does the client remove the
+file by issuing an appropriate rpc.
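A silly rename can be provoked on any nfs mount along these lines
+(mount point and file name are invented):
+
+	cd /mnt/nfs
+	echo hello > testfile
+	tail -f testfile &    # keep a file descriptor open
+	rm testfile           # the client silly-renames instead of removing the file
+	ls -a                 # a .nfsXXXX file shows up
+	kill %1               # close the descriptor; the client now removes the file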

+ +

This approach is not perfect. For one, if the client crashes, +a stale .nfs12345 file remains on the server. Second, +since silly renames are only known to the nfs client, bad things +happen if a different client removes the file.

+ + +

The file handle which an nfs client received through some earlier
+rpc can become invalid at any time due to operations on a different
+host. This happens, for example, if the file was deleted on the server
+or on a different nfs client, or when the directory that contains
+the file is no longer exported by the server due to a configuration
+change. Subsequent attempts to use this file handle for rpcs then
+fail with the ESTALE error.

+ +

The exercises below ask the reader to provoke silly-renamed files and
+stale file handles.

+ +SUBSECTION(«Performance Tuning») + +There are plenty of mount options for nfs. See nfs(5) +for details. We only cover a couple of the more interesting ones with +respect to performance. + + + +EXERCISES() + + + +HOMEWORK(« + +For each POSIX lock type, discuss whether file locks of this type +work on an nfs-mounted filesystem. If they do, discuss the relevant +mount options that are necessary to make them work. + +») + +HOMEWORK(« + + + +») + +HOMEWORK(« + +Describe the purpose of the nfsd and the +rpc_pipefs pseudo filesystems. Hint: See +Documentation/filesystems/nfs/nfs.txt of the Linux +source code. + +») + +HOMEWORK(« + +Provide an overview of nfs version 4.1 (Parallel nfs). + +») + +SUPPLEMENTS() + +SUBSECTION(«Broken Rename») + +
+	fd = open("foo.new", ...);
+	write(fd, "new content", ...);
+	close(fd);
+	rename("foo.new", "foo");
+
+ +SECTION(«Further Reading») + diff --git a/Git.m4 b/Git.m4 new file mode 100644 index 0000000..c1208fa --- /dev/null +++ b/Git.m4 @@ -0,0 +1,1092 @@ +TITLE(« + + The reason people have trouble wrapping their heads around git is + because they have been braindamaged by Github and Gitlab. + +», __file__) + +SECTION(«Version Control Systems») + +The term version control (also revision control +or source control) refers to the change management +of files. For example, some sort of change management is needed if a +team of geographically dispersed people concurrently make changes to +the files that comprise a document, fixing mistakes and adding new +content over time. + +A simple form of change management is "version control by email" +where the collaborators send revised copies of the document to each +other. While this approach can work for a small group of authors, it +quickly gets messy as more people are involved. One problem arises +when more than one person change the current version at the same +time because it is unclear how these changes should be combined, +in particular if the changes conflict with each other. For example, +a conflict arises if one person adds a reference to a portion of text +while a second person rewords the referenced text and moves it to a +different chapter. One way to work around this problem is to require +that only a single person is supposed to make changes at any given +time. This person edits the files and sends the revised version to +the next author. While this approach avoids conflicts, it is highly +inefficient. Version control systems (VCSs) are software tools +which help the collaborators to maintain different versions of a set of +files and to deal with concurrent and potentially conflicting changes. + +SUBSECTION(«Centralized and Distributed Version Control Systems») + +The idea of a VCS is to track the changes that are made to a tree of +files over time. All revisions of all files are stored in a database +which is called the repository of the project. The +recorded changes are organized as commits. Besides the +file contents, each commit carries metadata about the change, like +date and time and the name and the email address of the person who +made the change. Moreover, each time a commit is being created, the +author is asked to provide a commit message, a text which +is supposed to document why this particular change was made. + +Most VCSs are content agnostic in that they do not know or +care about the types of the files that are stored in the repository. In +order to visualize the difference between two versions of a file, +they have to rely on third party tools which understand the file +format. For plain text files, they usually employ the diff +algorithm and file format. The exercises of this section invite you to +take a look at the diff(1) command and its counterpart, +patch(1). A rough understanding of the diff format is +fundamental for using any VCS. + +The basic operations offered by VCSs are to retrieve ("check out") +old versions of the tree, to list the difference ("diff") between +two revisions, and to create new revisions from modified versions +of tracked files ("check in"). Most VCSs also have the concept of +branches. Branching becomes necessary if there is no single +"master" version, but two or more different versions that have to be +maintained concurrently. 
For example, in a repository which contains +the source code of some software project, there might be one "stable" +branch where only bugs get fixed, while new features are developed +in another "development" branch. Another feature each VCS needs to +implement is some kind of download service which lets (authenticated) +collaborators download a copy of the repository. + +VCSs can be classified as being either centralized or +distributed. A centralized VCS is characterized by taking a +client-server approach to change management. For a centralized VCS, the +basic operations outlined above all involve the server. For example, to +check in a change, the server is contacted to record the new revision +in its database. With distributed VCSs, on the other hand, there is +no central instance. Instead, all repositories are self-contained +in that they contain the full change database. Repositories are +synchronized in a peer-to-peer fashion, which has many advantages, +including speed and scalability. This is why many people consider +centralized VCSs obsolete. + +SUBSECTION(«VCS History») + +Probably the earliest VCS was the Source Code Control System +(SCCS) which was originally developed at Bell Labs in 1972. Although +it was single file based and hence did not have the concept of +a repository, it was the dominant VCS for Unix in the 1970s and +1980s. SCCS was also used as the "backend" for newer VCSs, notably RCS +(1982) and CVS (1990). The latter was the dominant VCS of the 1990s, +at least in the open source world. It was eventually superseded +by Subversion (SVN), initially released in 2000, which is +conceptually similar to CVS. In the late 1990s the distributed VCS +emerged, and have rendered the older centralized VCSs like CVS and SVN +obsolete. As of 2018, there are several distributed VCSs under active +development, of which git (started 2005 by Linus Torvalds, +the creator of Linux) is the most popular by far. We won't discuss +other VCSs further. + +EXERCISES() + + + +HOMEWORK(« +Come up with use cases for the diff and patch +utilities which are unrelated to version control. +») + +SECTION(«Basic Git Usage») + +SUBSECTION(«Getting Help») +- CMD(«git help») shows commands. +- CMD(«git help pull») help for pull. +- CMD(«git pull -h») short overview of pull options. + +SUBSECTION(«clone, init») +- get started +- init: get a new repository +- clone: copy a repository + +EXERCISES() + +- read CMD(«git help init») and CMD(«git help clone») +- create an empty repository and clone it + +SUBSECTION(«add, commit») +- add files +- commit changes + +EXERCISES() + +- add files to both repositories +- commit changes, write commit summary +- change files again +- commit changes again + +HOMEWORK(« +- Initialize a new repository. Create an empty file + CMD(«fruits.txt»), add it to the staging area with CMD(«git + add») and commit it. +- Use CMD(«printf "apple\npear\n" >fruits.txt») to add some + fruits to the file. Add the modified file to the staging + area. +- Use CMD(«printf "orange\n" >>fruits.txt») to modify the + file again. +- CMD(«git status») will show the file twice, why? +- Which version of the file (which fruits) will be committed + by CMD(«git commit -m "new fruits arrived"»)? +- How do you get the version with oranges commited? +», « +The second CMD(«git add») command adds the "apple and pear" version +to the staging area. 
Appending CMD(«orange») does not change what +has been staged, so the first version is listed under "Changes to be +committed", while the just modified version is listed under "Changes +not staged for commit". A simple CMD(«git commit») will commit the +staged version. To commit the other version (with oranges) one must +add the file again and then run CMD(«git commit»). + +») + +SUBSECTION(«log») +- view commit history + +EXERCISES() + +- Look at log in both repositories + +SUBSECTION(«fetch, merge, pull») +- get changes from others +- pull is fetch + merge + +EXERCISES() + +- use 'git pull' to get both repositories into the same state +- try create an edit conflict: change both repositories and use pull +- resolve the edit conflict + +SUBSECTION(«checkout, reset») +- reset: move HEAD +- checkout: undo changes, get older version + +EXERCISES() + +- Use checkout to look at older versions of your project + +SUBSECTION(«tags, branches») +- tag a release +- branch to start a new experimental feature + +EXERCISES() + +- Create a new branch, modify files +- Use checkout to switch between master and new branch + + +SUBSECTION(«alias») +- remote: manage aliases for remote repositories + +EXERCISES() + +- use CMD(«git remote -v») on both repositories + +SECTION(«Commit Graph») + +The git version control system has been designed for +EMPH(«distributed») development where more than one person +makes changes to the source tree simultaneously and each does +so independently of the other. The history of a source tree that +evolves in this manner can not be described by a simple linked list +of changes which could sequentially be applied to the original source +tree in order to obtain the "current version" of the tree. In fact, +there is no such thing as a "current version". Moreover, in general +two commits are not related to each other in the sense that the second +commit comes after the first, or vice versa. Instead, the relationship +between commits can only be described adequately by a structure known +in graph theory as EMPH(«directed, acyclic graph») (DAG). + +Many git commands operate on the DAG that corresponds to the commits +of the repository at hand. It is therefore useful to have a rough +understanding of the basic concepts of graph theory, and of DAGs in +particular. The exercises of this section ask the reader to translate +between the abstract, mathematical notion of a graph and its concrete +realization as commits in a git repository. We cover the partial order +of a DAG, and the derived concepts of reachabilty and infimum. Another +exercise aims to get the reader fluent in git's way of specifying +sets of commits. + +EXERCISES() + + + +HOMEWORK(« +define(«ril_node_size», «13») +define(«ril_node_bgcolor», «#ccc») +define(«ril_arrow_width», «2») +define(«ril_alpha», «eval(ril_node_size() * 4 / 10)») +define(«ril_beta», «eval(ril_node_size() * 92 / 100)») dnl sqrt(1 - alpha^2) +define(«ril_node», « + + $3 +») +define(«ril_rarrow», « + +») +define(«ril_larrow», « + +») +define(«ril_varrow», « + +») +
+ + ril_node(«15», «15», «G») + ril_node(«55», «15», «H») + ril_node(«95», «15», «I») + ril_node(«135», «15», «J») + ril_node(«35», «60», «D») + ril_node(«75», «60», «E») + ril_node(«115», «60», «F») + ril_node(«75», «105», «B») + ril_node(«115», «105», «C») + ril_node(«95», «150», «A») + + ril_rarrow(«15», «15», «35», «60») + ril_larrow(«55», «15», «35», «60») + ril_rarrow(«95», «15», «115», «60») + ril_larrow(«135», «16», «115», «60») + ril_rarrow(«35», «60», «75», «105») + ril_varrow(«75», «60», «75», «105») + ril_larrow(«115», «60», «75», «105») + ril_varrow(«115», «60», «115», «105») + ril_rarrow(«75», «105», «95», «150») + ril_larrow(«115», «105», «95», «150») + +
+For each of the following revision parameters, determine the commit +it refers to with respect to the commit graph on the left. Parent +commits are ordered left-to-right. + +
+A^0, A^, A^1, A^^^2, B^3^, A~1, A^2, A^^, A^1^1, A^^2,
+B^3^2, A^^3^2, A~2^2, A^^3^, A^^^, A^1^1^1, A~3.
+
+», « + +The suffix ^ to a revision parameter means the +first parent. ^n means the n-th parent. The suffix +~n means the n-th generation ancestor, following only +the first parents. See gitrevisions(7). + +
+A^0 = A, A^ = A^1 = B, A^^^2 = H, B^3^ = I, A~1 = B, A^2 = C,
+A^^ = A^1^1 = D, A^^2 = E, B^3^2 = A^^3^2 = J, A~2^2 = H, A^^3^ = I,
+A^^^ = A^1^1^1 = A~3 = G
+
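These identities can also be checked mechanically with CMD(«git rev-parse»). A
sketch, assuming a (hypothetical) repository whose commits are tagged A through
J to match the graph:

+	# hypothetical: the tags A..J correspond to the nodes of the graph
+	for rev in A^0 A^ A^^^2 B^3^ A~2^2; do
+		printf '%s = ' "$rev"
+		git rev-parse --short "$rev"
+	done
+	test "$(git rev-parse A^^3^2)" = "$(git rev-parse J)" && echo OK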
+») + +SECTION(«Git Objects and Refs») + +Unlike centralized version control systems like CVS and SVN, each copy +of a git repository contains the full history of the source tree, +rather than only a few recent revisions. This speeds up operations +like CMD(«git log») or CMD(«git diff») because all operations +are local. It also makes it possible to work offline as no network +connection is needed for most operations. The git database, which +is hidden inside the CMD(«.git») subdirectory of the repository, +contains all revisions of all tracked files as well as meta data like +file names, access permissions and commit messages. All contents are +stored as EMPH(«git objects») and the database is indexed by the +SHA1 hash value of the objects' contents. This indexing method is +called EMPH(«content-based addressing») because the hash value of +the contents of an object is used as the lookup key for the database. + +Depending on the size of the repository, the git database may contain +millions of objects, but there are only four different types of +objects: blob, tree, commit, and tag. The exercises of this section +invite the reader to look at each object type in more detail. Another +aim is to demystify the differences between heads, tags, refs and +branches, which all denote a reference to a commit object. + +EXERCISES() + +- Recall the properties of a + XREFERENCE(«https://en.wikipedia.org/wiki/Cryptographic_hash_function», + «cryptographic hash function»). +- How many objects of each type exist in the repo created by this + REFERENCE(two_branches.bash, script)? Check with CMD(git fsck -v). +- Clone the user-info repository with CMD(«git clone + git://ilm.eb.local/user-info») and explore all files in the + CMD(«.git/refs») directory. + +HOMEWORK(« +- Learn how to manually create a commit with CMD(«git hash-object»), + CMD(«git update-index»), CMD(«git write-tree»), and CMD(«git + commit-tree»). +») + +SECTION(«The Index») + +Every version control system needs some kind of EMPH(«tree object») +which records the information about one particular state of the source +tree. A commit then corresponds to a transition from one tree object +to another and is described by an edge in the commit graph. + +Git exposes one tree object in a special staging area called the +EMPH(«index»). One can think of the index as a table which contains +one row for each tracked file, which contains the information necessary +to generate a tree object. + +Under normal circumstances each row of the index has three columns: +The permission bits of the file, the file name, and the hash value +of the file's contents. When resolving merge conflicts, however, +it is handy to have additional columns which contain the hash values +of the two conflicting versions of the file plus the hash value of +a common anchestor. + +Many git commands operate on the index. For example the command +CMD(«git commit») (with no arguments) creates a commit from the +index. It does not even look at the working tree. Another example is +CMD(«git add foo»), which updates the hash column of CMD(«foo») +in the index to match the version of CMD(«foo») in the working tree. + +From the above it should be clear that the concept of an index is +quite natural in the context of version control systems. The fact +that git exposes the index, rather than hiding it as other version +control systems do, gives the user a great deal of control over the +next commit. Being able to tweak the index as needed is a good thing +not only for conflict handling. 
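The index can be inspected directly at any time; the following sketch works in
any repository and shows the table structure described above:

+	git ls-files --stage      # one line per tracked file
+	# columns: permission bits, blob hash, stage number (0 unless merging), file name
+	git ls-files --unmerged   # during a conflicted merge: the entries at stages 1-3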
+ +The exercises of this section try to convince the reader that the index +is by no means an advanced concept that is so hard to understand that +it should be hidden from the user. + +EXERCISES() + +- In any repository, add a modified tracked file and run CMD(«git diff»), +and CMD(«git diff --cached»). +- Make two unrelated changes to the same file, then run CMD(«tig»), +CMD(«git gui») or CMD(«git add -i») to record only one of the changes to +the index. Run CMD(«git diff --cached») to verify before you commit. +- During a merge, the index contains references to up to three versions +of each file. Explain to which commits these three versions correspond. + +SECTION(«Reset») + +Resetting a branch means to let the branch head point to a different +commit. This so-called EMPH(«soft») reset operates only on the +commit graph, but it touches neither the index nor the working tree. +By default git performs a EMPH(«medium») reset which additionally +resets the index to make it match the tree object of the new +commit. Finally, a EMPH(«hard») reset additionally updates the +working tree accordingly. + +The exercises of this section try to clarify the difference between +the three different flavors of resetting a branch. + +EXERCISES() + +- In the repo created with REFERENCE(«two_branches.bash», +«script»), create a new temporary branch with CMD(«git checkout +-b tmp topic2»). Reset this branch to its parent commit with +CMD(«git reset --hard HEAD^») Repeat using the CMD(«--soft») +and CMD(--medium) options. Examine the index at each step. +- When given one or more paths, CMD(«git reset») has a different +meaning: It copies named entries from the given revision to the +index. In the two-branches repo, run CMD(«git reset HEAD^ h») and +investigate the working copy and the index with CMD(«git diff») +and CMD(«git diff --cached»). + +SECTION(«Stashing») + +

The command git reset --hard throws away any +uncommitted changes in the working tree and the index. It returns to +a clean state where index and working tree match the tree +of the HEAD commit. Sometimes, however, one would like to return to +a clean state without losing or committing the local changes.

+ +

For example, suppose that your working tree has several modified +files because you are in the middle of something. Then you notice +an unrelated flaw in one of the files. Fixing this flaw has higher +priority than your current work and should be quick and easy. But you +don't want to lose your local changes and you don't want to commit +them either because this work is not yet complete.

+ +

In this situation git stash can help you out. This +command records the current state of the working directory and the +index. The modifications can be restored later, possibly on top of +a different commit.

+ +define(«str_node_size», «13») +define(«str_node_bgcolor», «#ccc») +define(«str_arrow_width», «2») +dnl sin(pi/3) = sqrt(3)/2 = 0.866 +define(«str_offset», «eval(str_node_size() * 87 / 100)») +define(«str_node», « + + $3 +») +define(«str_arrow», « + +») +
+ + str_node(«20», «65», «H») + str_node(«80», «65», «I») + str_node(«50», «13», «W») + str_arrow(«eval(20 + str_node_size())», «65», + «eval(80 - str_node_size() - 4)», «65») + str_arrow(«eval(20 + str_node_size() / 2)», + «eval(65 - str_offset())», + «eval(50 - str_node_size() / 2 - 2)», + «eval(str_node_size() + str_offset() + 2)») + str_arrow(«eval(80 - str_node_size() / 2)», + «eval(65 - str_offset())», + «eval(50 + str_node_size() / 2 + 2)», + «eval(str_node_size() + str_offset() + 2)») + +
+ +

Stashes are stored in a git repository as illustrated in the graph +to the left. H stands for the HEAD commit, I for a commit +that records the state of the index. W is a commit which includes +the changes of the working tree, relative to the HEAD +commit. It is reasonable to store W as a child of I since usually +the staged version corresponds to an earlier version of the tree.

+ +
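This structure can be verified in any repository which has at least one stash
entry, for example with CMD(«git rev-parse»):

+	git stash list               # one line per stash entry
+	git rev-parse 'stash@{0}'    # W: the commit recording the working tree
+	git rev-parse 'stash@{0}^1'  # H: the commit the stash is based on
+	git rev-parse 'stash@{0}^2'  # I: the commit recording the index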

After git stash the index and the working tree +are both reset to H so that git status reports +a clean state. git stash pop and git stash +apply apply the changes between H and W to the current working +directory. Since the working directory might be completely different +at this point, this operation can fail. Note that neither git +stash pop nor git stash apply restore the changes +to the index recorded in the stash. For this you need to specify +the --index option. Consult git-stash(1) +for details.
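Put together, the interrupted work flow amounts to the following round trip
(a sketch; the file name is made up):

+	git stash                            # put the unfinished work aside, clean tree again
+	$EDITOR notes.txt                    # make the urgent, unrelated fix
+	git commit -am 'fix unrelated flaw'
+	git stash pop                        # continue where you left off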

+ +The exercises of this section ask the reader to practice stashing +and unstashing in the "interrupted work flow" scenario described above. + +EXERCISES() + +- Run the REFERENCE(«stash.bash», «stash example script») +below. It creates a single commit and leaves the working tree +in a dirty state. Run CMD(«git diff») and CMD(«git status») +to see the uncommited changes. Suppose at this point you realize +the typo in the first sentence ("pomacous" instead of "pomaceous"). +Run CMD(«git stash») to stash away your modifications, then fix the +typo and create a commit with this typo fix only. Run CMD(«git stash +pop») to bring back the stashed modification. This will result in +a conflict as is verified with CMD(«git status»). Fix the conflict +by editing the file, and resolve the conflict by re-adding the file +with CMD(«git add»). Commit the modification and investigate all +three commits with CMD(«git log -p»). +- Discuss the difference and the pros and cons of stashing versus +creating a commit in a temporary branch. + +SECTION(«Blame») + +Version control systems track every version of every file. Therefore +they can, at least in principle, compute for each line of a tracked +file the commit that introduced it. + +For example, this information can be used to identify the commit which +introduced a certain bug in a source file of a software project. The +commit meta data tells who made the problematic change and when, +and the commit message (hopefully) explains why the change was made +in this way, who was involved in the discussion, and who reviewed or +approved the change. This information can be very valuable because +it helps to avoid similar bugs in the future. It is common practice +to mention the ID of the problematic commit in the commit message of +the fixup commit that eliminates the bug. + +git provides a simple way to annotate any tracked text file with +the commit information. This functionality is implemented as the +EMPH(«blame») subcommand. In the simplest form, CMD(«git blame») +adds the following information to each line of the given file: + +- the (abbreviated) ID of the commit that introduced the line, +- the name of the author of that commit, +- the date and time of the commit, +- the line number. + +The exercises of this section aim to make the reader aware of the +CMD(«blame») command and to convince her that the command can be +helpful at times, and that it is actually very easy to use. + +EXERCISES() + +- Clone the user-info repo with CMD(«git clone +git://ilm.eb.local/user-info») and run CMD(«git blame +doc/user-guide/user-guide.txi») and discuss each column of the output. + +- Repeat the CMD(«blame») command, but this time add the +CMD(«--line-porcelain») option and discuss what kind of statistics +could be created from the output of this command if it was run on +all files of a large software project. + +SECTION(«Proposing and Discussing Changes») + +More often than not, even simple changes have to be tweaked to +perfection in several steps. The problem is that the text or code +always looks great to the person who just wrote it, while another +person, or even the author herself a day later, finds serious +flaws. Also, coming up with a good commit message is much easier +EMPH(«without») having all the details in short term memory. The sad +truth is that there's nothing that can be done about that, as this +is just how the human brain works. 
In view of this unfixable human +limitation, the highest quality is achieved by EMPH(«peer review»), +which is why the policy of many software projects demands that each +commit has to be reviewed and approved by somebody else before it +hits the master branch of a repository. + +The CMD(«send-email») subcommand of git makes peer review as easy as +it can get. It sends the specified range of commits to the recipients +given at the command line or in the commit messages. The recipients +can apply the patch series to their own local repository with CMD(«git +am») ("am" is short for "apply mailbox"). As CMD(«git send-email») +sends each commit as a separate email, controverse changes can be +discussed by simply replying to the corresponding emails. If some +particular commit triggers a discussion, it probably needs to be +adjusted according to the reviewer's comments, and the author should +send a revised version later. In this case also the commit message +should be adjusted to include a summary of the discussion. This process +can be iterated several times if necessary. Once the reviewers have +no more objections, the CMD(«Reviewed-by») tags can be added to +the final version of each patch in the series and the series can be +applied to the master branch. + +This way of discussing commits by email has several advantages. For +one, it does not force everyone into a specific development tool or +platform, as everybody can use his favorite email client to review +the patches and reply to the patch emails. Second, no additional +infrastructure is required and no accounts or access persmissions need +to be maintained. In fact, it is the EMPH(«point») of distributed +development to have no central infrastructure that could become the +single point of failure. Third, additional people can be included in +the discussion simply by adding them to the CC list. + +EXERCISES() + +- Preparation: Clone the user-info repo with CMD(«git clone +git://ilm.eb.local/user-info»). +- Create and check out a branch that points as the same commit as +the master branch: CMD(«git checkout -b user-guide origin/master»). +- Open the CMD(«doc/user-guide/user-guide.txi») file with an +editor and add a sentence to the first section. Create a commit +with CMD(«git commit -av») and send this commit to yourself with +CMD(«git send-email HEAD^..»), then check your mail. +- In practice you would send the commit to the maintainer of the repo +rather than to yourself. Assume you already did this and the maintainer +replied that your change is good, but has some minor issue. Edit +the file again and change the sentence. Run CMD(«git commit -av +--amend») to replace your commit. Double check with CMD(«git show») +that everything is fine. Then run the above CMD(«git send-email») +command again. +- Now suppose you are the maintainer of the project, and you received +the patch by email from somebody else and want to apply it to your +repository. Get rid of the commit with CMD(«git reset --hard HEAD^»), +save the patch email into the file CMD(«improvement.patch») and +copy the file to the machine on which your repo is stored. Then run +CMD(«git am improvement.patch») to apply the patch. Note: + - there is no need to remove the email headers, + - the commit message and author information stays intact, + - the SHA1 number of the commit has changed. +- If your repository is world-readable and the project maintainer +can log in into the machine on which it is stored, CMD(«git +request-pull») is an alternative to CMD(«git send-email»). 
Run +CMD(«git request-pull HEAD^ $PWD») to see how your pull request +looks like. Then run CMD(«git request-pull HEAD^ $PWD | mail -s +"[Pull] documentation improvements" $LOGNAME») to send the pull +request to yourself. In practice, you would of course send the pull +request to the maintainer. + +SECTION(«Remote Repositories, Push and Fetch») + +- CMD(«push»): update remote refs using local refs +- input: URL of the remote repo, refs (branches or tags) to push +- refspec, e.g., CMD(«+refs/heads/master:refs/remotes/mpi/master») +- forcing a push or fetch +- deleting branches from a remote with CMD(«git push :topic») + +EXERCISES() + +- Recall the definition of a fast-forward merge in +CMD(«gitglossary(7)»). +- In the refspec CMD(«:») which kind of objects are +allowed for CMD(«») and CMD(«»)? +- By default CMD(«git push :») fails if the remote +branch CMD(«») does not fast-forward to the commit specified by +CMD(«»). The optional leading CMD(«+») tells git to update the +destination ref anyway. Discuss why this "forced push" is dangerous. +- Create an REFERENCE(«empty_repo.bash», «empty repo») and a second +repo with REFERENCE(«two_branches.bash», «two branches»). Change +the CWD to this second repository. All operations below are to be +executed from this location. +- Push one of the branches from the two-branch repo to the empty +repo by executing a suitable CMD(«git push») command. Check out +one of the two branches and modify the commit message with CMD(«git +commit --amend»). Then run the same CMD(«push command») again and +understand the error message you get. Find two ways to force the push. +- Add a suitable CMD(«URL») line to the CMD(«.git/config») file of +the two-branch repo that lets you refer to the other repo by name. Run +CMD(«git ls-remote ») to list the branches of this repository +(where CMD(«») is the name you have chosen in the CMD(«URL») +line). +- Add a suitable CMD(«push») line to the configuration file of the +two-branch repo so that simply running CMD(«git push») pushes one +branch to the second repository. +- Remove one branch of the formerly empty repository by running a +suitable CMD(«git push :») command from the two-branch +repository. +- Discuss what should happen if the branch to be removed with +CMD(«git push :») is currently checked out in the remote +repository. Then check if git does what you suspect. + +SECTION(«Rebase») + +- purpose: polish commits for the final merge to master +- reorder commits +- rewriting commit messages +- removing, splitting and joining commits +- interactive or non-interactive +- automated testing + +EXERCISES() + +- Run the REFERENCE(«rebase_example.bash», «rebase example script») +to create a repository with a (stable) master branch, and a (rather +messy) topic branch (called "bembel") and CMD(«cd») into the top +level directory of this git repo. +- Execute CMD(«tig master bembel») to visualize the commit graph +and the indiviual commits. +- Check out the topic branch and rebase it on top of master with +CMD(«git rebase master»). Run the above CMD(«tig») command again +to see the effect. +- The topic branch contains one commit which is actually unrelated to +the topic. Identify this commit and run CMD(«git rebase -i master») +to change the order of the commits so that the unrelated commit +comes first. +- Find the SHA1 of the unrelated commit with CMD(«git log») and +merge only this commit into the master branch with CMD(«git checkout +master»), CMD(«git merge »). 
Note that this is a fast-forward +merge, so no merge commit is created. +- Check out the topic branch and run CMD(«git rebase -i master») +again to combine all commits to a single commit. This is done by +replacing CMD(«pick») by CMD(«squash») for all but the first +commit. Note that you are asked to combine the individual commit +messages into a single (meaningful) one. +- Create a test script (CMD(«grep bembel * || exit 1»)) that checks +that the word "bembel" is never spelled in lower case. On the topic +branch, run CMD(«git rebase -i master») and add one line between +each commit that runs your test script. Hint: Read the comments at +the bottom of the message. +- Merge the topic branch into the master branch with CMD(«git checkout +master») and CMD(«git merge bembel»). Then delete the topic branch +with CMD(«git branch -d bembel»). Does this lose any commits? + +SECTION(«Conflict Resolution») + +- conflicts can happen during CMD(«merge»), CMD(«pull»), + CMD(«stash») operations. +- conflict markers +- stages: 1: base, 2: ours, 3: theirs +- rerere: replay recorded resolution + +EXERCISES() + +- Run this REFERENCE(merge.bash, script) which creates a git repository +with two branches in a subdirectory of the current directory. Try to +merge the two branches with CMD(git merge topic1). Understand why the +command fails. Explain the output of CMD(git status), CMD(git diff) +and CMD(git diff --cached), CMD(git log -p --merge). +- The terms "merge base", "ours", "theirs" are frequently used to +denote commits during a conflicted merge. Determine these commits for +the conflict at hand. Run CMD(git ls-files -u) and CMD(git ls-tree +$X) for CMD(X=master, topic1, topic2) and describe the meaning of +each column in the output. +- Look at the two diffs from each branch: CMD(git log --merge -p) and +at the three originals: CMD(git show :$X:h), where CMD(«X=1,2,3»). +- Resolve the conflict with CMD(echo "hello all" > h), followed +by CMD(git add h). Check how the output of CMD(git status) CMD(git +ls-files -u) has changed. Run CMD(git commit -m "resolved") to record +your conflict resolution and verify with CMD(git log --graph). Describe +the meaning of the two SHA1 numbers shown in the CMD(git log -1) +output. +- Activate the EMPH(«rerere») feature by adding CMD(«enabled = +true») to the CMD(«rerere») section of your CMD(«.git/config») +file. Run CMD(«git reset --hard HEAD^») to get rid of the merge +commit and to return to the previous commit. Repeat the merge, notice: +git records the conflict and the resolution. Reset and repeat again. + +SUPPLEMENTS() + +SUBSECTION(«Diff Example») + +
+	 The git version control system is a powerful
+	-open source tool. Unfortunately, with more
+	-than 100 subcommands and even more command
+	-line options, it is way too difficult to use
+	+open source tool. Fortunately, thanks to the
+	+Unix course web pages it is easy to learn even
+	 for mere humans.
+ +SUBSECTION(«empty_repo.bash») + +
+	#!/bin/bash
+
+	set -e
+
+	GD=$(mktemp -d /tmp/ct-git-XXXXXX)
+	cd "$GD"
+	git init
+	echo cd "$GD"
+
+
+ +SUBSECTION(«two_branches.bash») + +
+	#!/bin/bash
+
+	set -e
+
+	GD=$(mktemp -d /tmp/ct-git-XXXXXX)
+	cd "$GD"
+	git init
+	echo hello > h
+	echo 'apples, peas' > fruits
+	git add h fruits
+	git commit -m initial
+
+	git checkout -b topic1
+	echo world >> h
+	echo apples > fruits
+	git commit -am 'add world, peas are no fruits'
+
+	git checkout -b topic2 master
+	echo people >> h
+	git commit -am 'add people'
+
+	echo cd "$GD"
+
+
+ +SUBSECTION(«merge.bash») + +
+	#!/bin/bash
+
+	set -e
+
+	GD=$(mktemp -d /tmp/ct-git-XXXXXX)
+	cd "$GD"
+	git init
+	echo hello > h
+	git add h
+	git commit -m initial
+
+	git checkout -b topic1
+	echo 'apples' > fruits
+	git add fruits
+	git commit -m fruits
+	echo 'pears' >> fruits
+	git commit -am 'more fruits'
+
+	git checkout -b topic2 master
+	echo 'peas' > vegetables
+	git add vegetables
+	git commit -m vegetables
+
+	git merge --no-edit topic1
+
+	echo Created merge example repository in:
+	echo "$PWD"
+
+ +SUBSECTION(«stash.bash») + +
+	#!/bin/bash
+
+	set -e
+
+	GD=$(mktemp -d /tmp/ct-git-XXXXXX)
+	f='apple-definition'
+	cd "$GD"
+	git init
+
+	echo 'The apple tree (Malus domestica) is a deciduous tree in the rose
+	family best known for its sweet, pomacous fruit, the apple.' > "$f"
+
+	git add "$f"
+	git commit -m 'initial draft of apple definition'
+
+	echo 'The apple tree (Malus domestica) is a deciduous tree in the rose
+	family best known for its sweet, pomacous fruit, the apple.  The tree
+	originated in Central Asia, where its wild ancestor, Malus sieversii,
+	is still found today.' > "$f"
+
+
+ +SUBSECTION(«rebase_example.bash») + +
+	#!/bin/bash
+
+	set -e
+
+	GD=$(mktemp -d /tmp/ct-git-XXXXXX)
+	cd "$GD"
+	git init
+	f1='apfelwein'
+	f2='culture'
+
+	echo 'Apfelwein or Most are German words for cider. It is also
+	regionaly known as Ebbelwoi, Äppler, Stöffsche, Apfelmost Viez,
+	and saurer Most.
+	' > "$f1"
+	git add "$f1"
+	git commit -m 'Add initial definition of Ebbelwoi.'
+
+	echo '
+	In the Frankfurt area, berries from the service tree (Sorbus
+	domestica), are added to increase astringency. This specific
+	type of Apfelwein is called Speierling.
+	' >> "$f1"
+	git commit -am 'Add section on Speierling.'
+
+	git checkout -b 'bembel'
+	echo '
+	Apfelwein is served in a "Geripptes", a glass with a lozenge cut that
+	refracts light and improves grip.
+	' > "$f2"
+	git add "$f2"
+	git commit -m 'Initial draft of culture file.'
+
+	git checkout master
+	echo '
+	The juice or must is fermented with yeast to produce an alcoholic
+	beverage usually around 6% abv.
+	' >> "$f1"
+	git commit -am 'Mention that Apfelwein is an alcoholic beverage.'
+
+	git checkout 'bembel'
+	echo '
+	Most establishments will also serve Apfelwein by the Bembel (a specific
+	Apfelwein jug), much like how beer can be purchased by the pitcher
+	in many countries.
+
+	' >> "$f2"
+	git commit -am 'Add section on bembel to culture file.'
+
+	sed -i 's/regionaly/regionally/g' "$f1"
+	git commit -am 'Fix typo in apfelwein section.'
+
+	sed -i '/^Most establishments/,$d' "$f2"
+	echo '
+	Most establishments will also serve Apfelwein by the Bembel (a specific
+	Apfelwein jug). The paunchy bembel is made from salt-glazed stoneware
+	and always has a basic grey colour with blue-painted detailing.
+	' >> "$f2"
+	git commit -am 'Rewrite section on Bembel.'
+
+	sed -i 's/bembel/Bembel/g' "$f2"
+	git commit -am 'Always spell Bembel in upper case.'
+
+	echo "cd $GD"
+
diff --git a/Gridengine.m4 b/Gridengine.m4 new file mode 100644 index 0000000..72f7711 --- /dev/null +++ b/Gridengine.m4 @@ -0,0 +1,740 @@ +TITLE(« + + The cluster makes it possible to do, in half an hour, tasks + which were completely unnecessary to do before. -- Unknown + +», __file__) + +SECTION(«Terminology») + +- *Cluster* (often also called *grid*): a group of machines cooperating +to do some work +- *Job* : what you want the cluster to do for you +- *Nodes* (also called exec hosts): The machines that actually do +the work +- *Master*: machine that knows all jobs, all nodes, their capabilities +and current load +- *SGE* (Sun gridengine): Software that runs on all relevant machines +- *Submit host*: takes a list of jobs to be executed and sends them +to the master (ilm). The master then distributes them across the +available nodes +- *Slot*: a single computing unit (one CPU core) + +
+ + + + + + + + + + + + + + + /> + + + + + + + + + + + + + + + + + + + + + <--! left arrow --> + + + + + + + + + + + + + + + + + + <--! right arrow --> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +

Users (circles) submit their jobs (squares) from the submit hosts
+(white) to the master (yellow). The master assigns each job to a
+suitable execution host (grey) on which it is scheduled.

+ +
+ +EXERCISES() +- Read this +XREFERENCE(«https://web.archive.org/web/20160506102715/https://blogs.oracle.com/templedf/entry/sun_grid_engine_for_dummies», +«introduction to SGE») + +SECTION(«Cluster hardware and setup») +- 48/64 core AMD (Opteron and Epyc), 512G-2T RAM, 25Gbit ethernet +- separate network (no internet, limited campus services) +- NFS root, local /tmp, two global temp file systems +- SGE + +EXERCISES() + +- Look at XREFERENCE(«http://ilm.eb.local/ganglia/», «web + frontend») of the ganglia monitoring system. +- Run the CMD(qhost), CMD(lscpu), CMD(free), CMD(w), CMD(htop) + commands to list nodes, print CPUs, available memory and + swap, and the load average. +- Examine all columns of the CMD(«q-charge --no-joblist + --no-globals») output. +- Open two terminals and ssh into two different cluster nodes + (note: the CMD(qhost) command prints the names of all nodes), + run CMD(touch ~/foo-$LOGNAME) on one of them to create a + file in your home directory. Check whether the file exists on + the other node by executing CMD(«ls -l ~/foo-$LOGNAME»). Do + the same with CMD(touch /tmp/foo-$LOGNAME). +- Read the section on the accounting system of the + XREFERENCE(«http://ilm.eb.local/clusterdoc/The-Accounting-System.html#The-Accounting-System», + «cluster documentation») to learn how charges are computed. + +HOMEWORK(« +Find three different ways to determine how many CPU cores +the cluster has. +», « +- Log in to any cluster node and read the message of the day. +- Run CMD(«qhost») and add up the third column. +- Run CMD(«nproc, lscpu») or CMD(«cat /proc/cpuinfo») on each + node and sum up the results. +- Run CMD(«qconf -se ») for each node and + sum up the values shown as CMD(«num_proc»). +- Run CMD(«q-gstat -s») and add the slot counts. +- Read the first sentence on the + XREFERENCE(http://ilm.eb.local/clusterdoc/, cluster documentation + main page). +- Visit the XREFERENCE(«http://ilm.eb.local/ganglia/», «ganglia») + page and subtract from the number shown as "CPUs Total" the CPU count + of the (two) servers which are not cluster nodes. +») + + +HOMEWORK(« +Read the CMD(«q-charge») manual page and learn about the +CMD(«--no-joblist») option. Write a config file for CMD(«q-charge») +to activate this option automatically. Hand in your config file. +», « +Simply create the file named CMD(«.q-chargerc») in the home directory +which contains a single line CMD(«no-joblist»). However, with this +file in place, there is no easy way to EMPH(«enable») the job list. +») + +SECTION(«Submitting and Monitoring») + +- interactive and non-interactive (batch) jobs +- CMD(«qsub»): submitting job scripts +- CMD(«qstat»): monitoring job state +- CMD(«h_vmem»), CMD(«h_rt»): Specify memory and running time +- CMD(«qdel»): removing running or waiting jobs + +EXERCISES() + +- Execute CMD(«qlogin -l h_rt=60») to get a shell on a +random(?) cluster node. +- Write in a file called REFERENCE(«testscript.sh», +«testscript.sh») with the content below. +- Look at the CMD(«qsub») man page to tell which of the following +options of CMD(«qsub») might be useful to set. CMD(«-l h_vmem -l +h_rt -cwd -j -V -N -pe»). +- Submit CMD(«testscript.sh») with CMD(«qsub -cwd testscript.sh») +- Quick! Type CMD(qstat). Depending on the current cluster load you +will either see your job in the queue (waiting), running, or no longer +there (already finished). +- After your job has finished, find out if it was successful using +CMD(qacct -j "jobID"). If you can't remember, look at the files that +were created. 
+- How much memory did your job use? +- Let's see what our CMD(«testscript.sh») did. Where and what is +the output of the three commands? +- Submit the same job script again and remove it with CMD(«qdel») +while the job is running or waiting. + +HOMEWORK(« +Write a submit script which prints out the host it is running +on. Submit the script and request a running time of one minute, and +500M of memory. Hand in the script, the command you specified for +submitting the script, and the output. +», « +The script only needs to contain the single line CMD(«hostname»). In +particular, the shebang (CMD(«#!/bin/sh»)) may be omitted. +») + +SECTION(«Array jobs and parallel jobs») + +- array job: a single job with many subjobs. Equivalent to a set of +jobs which all run the same job script. +- parallel job: jobs that use more than one slot (CPU core) + +EXERCISES +- Run CMD(«mkdir array_job_dir») and create 20 files in that +directory called CMD(«input-1») to CMD(«input-20») (hint: example +from last week). +- Create REFERENCE(«array_job.sh», «array_job.sh») and discuss +what the script does. +- Submit an array job to the cluster using CMD(«qsub -t 1-20 +array_job.sh»). Once all array tasks have finished, you'll find that +all your files were renamed. +- You might want to check if the jobs succeeded. Use CMD(qacct) to +check the exit codes of all jobs. Think about pipes and the commands +CMD(sort), CMD(uniq) and CMD(grep) to make it easier for you. +- Run CMD(«echo stress -c 2 | qsub -l h_rt=100») to submit a job. +Use CMD(«qstat») to find the node on which the job in running. Run +CMD(«ssh -t htop») and check how many stress processes +are running and the share of CPU time they get. Repeat, but this +time submit a parallel job by adding CMD(«-pe parallel 2») to the +options for CMD(«qsub»). + +HOMEWORK(« +Discuss when it makes sense to restrict the number of simultaneously +running jobs. +», « +One reason is to be nice to others: if you limit the number of your +jobs you don’t block other users by occupying the whole cluster. This +is only important for long running jobs though, as the SGE software +tries to balance jobs between users. Another reason is to not overload +the file server in case your jobs do heavy I/O. +») + +HOMEWORK(« +Submit the REFERENCE(«array_job.sh», «array_job.sh») script +again as an array job, but make sure that only at most two of the +10 tasks are going to run simultaneously. Hand in the corresponding +CMD(«qsub») command. +», « +The command CMD(«qsub -t 1-20 -tc 2 array_job.sh») will run at most +two of the 10 tasks simultaneously. +») + +SECTION(«Job running time and memory consumption») + +- Default: hard limit of 1G RAM, killed after one day +- Q: How long will my job run? How much memory does it need? A: +CMD(«qacct») +- Long job waiting times for high requests +- Short queue + +EXERCISES() + +- If a job needs much memory, the default of 1G might not be +enough. Find out how much memory one terminated job of yours actually +needed by running CMD(«qacct -j »). In particular, +look at CMD(«exit status») (not zero if something went wrong) +and CMD(«maxvmem») (actual memory consumption of your process). +- Submit the job script again, but this time specify CMD(«-l +h_vmem») to request more memory. Once the job is complete, compare +the CMD(«maxvmem») field of the CMD(«qacct») output and the value +specified with CMD(-l h_vmem). +- Jobs could also be much longer than the default value allows (1 +day). Use CMD(«-l h_rt») to request a longer running time. 
Run a +test job with default settings or a rough estimation and see if it +fails (CMD(«qacct»), exit status not zero). Look at start and end +time and compare with CMD(-l h_rt) value. Adjust CMD(«-l h_rt») +and run the job again. Reevaluate until your job ran successfully. +- If your job is very short, you might set CMD(«-l h_rt») below +1h to enter the short queue, for example CMD(«-l h_rt=0:30:0») +for 30mins maximum run time. By setting a small value for CMD(«-l +h_rt») you could use this resource and possibly get your job queued +earlier than with default values. The command CMD(«qconf -sql») +lists the names of all queues, and CMD(«qconf -sq | grep +"^._rt"») shows you the soft and the hard limit of running time. +See the section on resource limits of the CMD(«queue_conf») manual +page to learn more about the two types of limits. + +SECTION(«Queues, Queue Instances») + +

A queue is a named description of the requirements a job must have to
+be started on one of the nodes, like the maximal running time or the
+number of slots. The queue descriptions are organized in plaintext
+files called queue configurations which are managed by the
+qmaster and which can be modified by privileged users by means of the
+ qconf(1) command.

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + Queue Instances + + + + + + + + Cluster Queue: + + + Set of Queue + + + Instances + + + + + + + Hosts + + + + + + + + +
+ +

Among other configuration parameters, a queue configuration always
+contains the list of execution hosts. On each node of this
+list one realization of the queue, a queue instance, is
+running as part of the execution daemon sge_execd(8).
+The list is usually described in terms of hostgroups
+where each hostgroup contains execution hosts which are similar in
+one aspect or another. For example, one could define the hostgroup
+ @core64 to contain all nodes which have 64 CPU cores.
+The diagram to the left tries to illustrate these concepts.

+ +
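Hostgroups and their members can be listed with CMD(«qconf»); the hostgroup
name below is the example used above:

+	qconf -shgrpl          # list all defined hostgroups
+	qconf -shgrp @core64   # show the execution hosts in this hostgroup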

While a running job is always associated with one queue instance, +it is recommended to not request a specific queue at job submission +time, but to let the qmaster pick a suitable queue for the job.

+ +

An execution host can host more than one queue instance, and queues +can be related to each other to form a subordination tree. +Jobs in the superordinate queue can suspend jobs in the subordinated +queue, but suspension always takes place at the queue instance level. +

+ +
+ +EXERCISES() + +- Run CMD(«qconf -sql») to see the list of all defined queues. Pick a +queue and run CMD(«qconf -sq ») to show the parameters of +the queue. Consult the CMD(«queue_conf(5)») manual page for details. +- Read the CMD(«prolog») section in CMD(«queue_conf(5)») manual +page. Examine the CMD(«/usr/local/sbin/prolog») file on the nodes and +try to understand what it actually does. See commit CMD(«0e44011d») +in the cluster repostitory for the answer. +- Run CMD(«echo stress -c 2 | qsub») to submit a job which starts two +threads. Determine the node on which the job is running, log in to +this node and examine the CPU utilization of your job. + +SECTION(«Accounting») + +- accounting file contains one record for each _finished_ job +- plain text, one line per job, entries separated by colons +- qacct: scans accounting file +- summary or per-job information +- buggy +- easy to parse "by hand" + +EXERCISES() + +- Run CMD(«qacct -o») to see the full user summary and CMD(«qacct +-o $LOGNAME -d 90») to see the summary for your own user, including +only the jobs of the last 3 months. +- Check the CMD(«accounting(5)») manual page to learn more about +the fields stored in the accounting records. +- Submit a cluster job with CMD(«echo sleep 100 | qsub -l h_vmem=200M +-l h_rt=10»), wait until it completes, then check the accounting +record for your job with CMD(«qacct -j »). In particular, +examine the CMD(«failed») and CMD(«maxvmem») fields. Compare +the output with CMD(«print_accounting_record.bash »), +where the CMD(«print_accounting_record.bash») script is shown +REFERENCE(«print_accounting_record.bash», «below»). +- Check out the XREFERENCE(«http://ilm.eb.local/stats/», «statistics +page»). Tell which histograms were created from the accounting file. +- Search for CMD(«com_stats») in the +XREFERENCE(«http://ilm.eb.local/gitweb/?p=cluster;a=blob;f=scripts/admin/cmt;hb=HEAD», +«cluster management tool») and examine how these statistics are +created. + +SECTION(«Complex Attributes») + +- used to manage limited resources +- requested via CMD(«-l») +- global, or attached to a host or queue +- predefined or user defined +- each attribute has a type and a relational operator +- requestable and consumable + +EXERCISES() + +- Run CMD(«qconf -sc») to see the complex configuration. +- Check the contents of +CMD(«/var/lib/gridengine/default/common/sge_request»). +- Run CMD(«qconf -se node444») to see the complex configuration +attached to node444. +- Discuss whether it would make sense to introduce additional complex +attributes for controlling I/O per file system. + +SECTION(«Tickets and Projects») + +- tickets: functional/share/override +- project: (name, oticket, fshare, acl) +- jobs can be submitted to projects (CMD(«qsub -P»)) + +EXERCISES() + +- Read the CMD(«sge_project») manual page to learn more about SGE +projects. +- Examine the output of CMD(«qconf -ssconf») with respect to the three +types of tickets and their weights. +- Check the CMD(«sge_priority(5)») manual page to learn more about the +three types of tickets. +- Discuss whether the SGE projects concept is helpful with respect +to accounting issues and grants (e.g., ERC). +- Discuss whether introducing override or functional share tickets +for projects is desirable. + +SECTION(«Scheduler Configuration») + +- fair share: heavy users get reduced priority +- share tree: assign priorities based on historical usage +- reservation and backfilling + +EXERCISES() + +- Run CMD(«qstat -s p -u "*"») to see all pending jobs. 
Examine +the order and the priority of the jobs. +- Run CMD(«qconf -ssconf») to examine the scheduler configuration. In +particular, look at the CMD(«policy_hierarchy») entry. Consult +the CMD(«sched_conf(5)») and CMD(«share_tree(5)») manual pages +for details. +- Discuss the various scheduling policies described in this +XREFERENCE(«http://gridscheduler.sourceforge.net/howto/geee.html», +«document»). +- Discuss the pros and cons to schedule preferentially to hosts which +are already running a job. That is, should CMD(«load_formula») +be CMD(«np_load_avg») (the default) or CMD(«slots»)? See +XREFERENCE(«http://arc.liv.ac.uk/SGE/howto/sge-configs.html», +«sge-configs») and CMD(«sched_conf(5)») for details. + +SUPPLEMENTS() + +SUBSECTION(«testscript.sh») + +
+	#!/bin/sh
+	sleep 100 # wait to give us time to look at the job status
+	echo "This is my output" > ./outputfile
+	echo "Where does this go?"
+	ls ./directorythatdoesnotexisthere
+
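One possible way to submit this script and to check on it afterwards; the
resource values are only examples and CMD(«<jobid>») is a placeholder:

+	qsub -cwd -l h_rt=0:10:0 -l h_vmem=200M testscript.sh
+	qstat -u "$LOGNAME"                                # while the job is queued or running
+	qacct -j <jobid> | grep -E 'exit_status|maxvmem'   # after the job has finished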
+ +SUBSECTION(«array_job.sh») + +
+	#!/bin/sh
+	# Lines beginning with #$ tell the program to use the following as
+	# option for the command.  By the way, you don't need to write this
+	# line into "testscript.sh" ;)
+	#$ -cwd
+	#$ -j y
+	#$ -l h_rt=0:1:0
+	mv input-$SGE_TASK_ID ./output-$SGE_TASK_ID
+
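A possible way to prepare the input files and to submit the script as an array
job; the paths and the job ID are only examples:

+	mkdir array_job_dir && cd array_job_dir
+	touch input-{1..20}
+	qsub -t 1-20 ../array_job.sh
+	qacct -j <jobid> | grep exit_status | sort | uniq -c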
+ +SUBSECTION(«print_accounting_record.bash») + +
+	#!/bin/bash
+	(($# != 1)) && exit 1
+	awk -F: "{if (\$6 == $1) print \$0}" /var/lib/gridengine/default/common/accounting
+
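The script expects a job ID as its only argument and prints the matching raw
record (field 6 of the accounting file holds the job number). A hypothetical
invocation:

+	./print_accounting_record.bash 4711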
diff --git a/Introduction.m4 b/Introduction.m4 new file mode 100644 index 0000000..5742021 --- /dev/null +++ b/Introduction.m4 @@ -0,0 +1,251 @@ +TITLE(« + Tell me and I forget. Teach me and I remember. Involve me + and I learn. – Benjamin Franklin +», __file__, «Advanced Problems in the Linux Environment») + +OVERVIEW(« + This practical training course covers basic and advanced Unix + and Linux topics. It targets scientists, programmers and system + administrators. Readers will find plenty of material and exercises + of varying difficulty. Move the pointer to the grey bar at the left + to open the navigation menu. +») + +SECTION(«About») + +ifelse(PUBLIC(), «true», «dnl public version of the pages + +

These pages were originally written to provide the necessary +background for using the IT infrastructure of the Max Planck Institute +for Developmental Biology and the Friedrich Miescher Laboratory in +Tübingen, Germany. Over time they morphed into a generic document +that was made public in December 2019.

+ +

The title is of course a pun on the famous book "Advanced +Programming in the Unix Environment" by W. Richard Stevens. While +Stevens' book centers around C programming, we try to convey +fundamental ideas without assuming substantial programming skills. +An elementary knowledge of shell and C programming is certainly +helpful, though. We also put an emphasis on Linux, which was still +in its infancy when Stevens' book was published in 1992.

+ +

All pages are served as static html files with no active +contents. They do not require javascript, work with any browser and do +not track the user in any way. In particular, we don't use cookies, +there is no "like" button, and we do not employ any web analysis +service like google analytics. Also, there are no advertisements of +any kind.

+ +SUBSECTION(«Exercises and Homeworks») + +

The exercises generally try to encourage the reader to +think about a specific topic rather than solve meaningless +problems mechanically. Many exercises suggest to examine further +literature. Longer or more challenging exercises are labelled as +homework. Solutions to homework exercises are only provided if at least +one person hands in a draft of a solution. To do so, send plain text +email to Andre Noll.

+ +SUBSECTION(«Feedback») + +

These pages get updated when errors are found, contents become +obsolete, or improvements are suggested. Feedback via the above +mailto link is appreciated. Besides solutions of homework exercises, +suggestions for additional topics or improvements of existing contents +are welcome. Please also point out unclear wording, grammar mistakes +and typos.

+ +SUBSECTION(«License») + +

This work is published under the +GNU Free +Documentation License (GFDL). You may copy, redistribute, and +modify these pages but all copies and derivatives are required to be +available under the same license.

+ +SUBSECTION(«See Also») + +

Go back to the author's home page.

+ +», « dnl internal version: different Introduction and Motivation + +

These pages aim to provide the necessary background for using the
+IT infrastructure of the MPI for developmental biology and the FML.
+They complement the
+
+ User guide,
+
+but are meant to be a practical training course rather than just a
+text document. While the contents of the user guide are relevant to
+almost every member of the institute, the primary target audience of
+the Unix course is scientists with an emphasis on IT. In particular,
+computer scientists who intend to use the compute cluster will find
+plenty of material of varying difficulty to learn basic and advanced
+topics related to Unix and Linux.

+ +

We first cover the general concepts of the Unix operating system while +later chapters focus on Linux specific topics and selected command +line tools. The exercises aim to convey understanding by inviting the +reader to read background information and to think about the topic +at hand. This is in contrast to many other tutorials which provide +quick solutions to frequently asked questions, targeting users who +do not wish to spend the time necessary to gain insight. Longer or +more challenging exercises are labelled as homework.

+ +

Feedback is appreciated. In fact, significant changes to these +pages are always triggered by the users asking questions. On one +hand, this makes sure that new material stays relevant to the target +audience. On the other hand, it also helps to fine-tune the degree +of difficulty and the detail of the solutions. As a general rule, +solutions to existing exercises are only provided if at least one +person hands in a proposal. To do so, send plain text email to Andre Noll.

+ +SECTION(«Motivation») + +A quick glance at the table of contents reveals that the topics +of this course center around command line utilities rather than +software which comes with a Graphical User Interface +(GUI). While GUIs serve a purpose, especially for graphical tasks +(image manipulation, presentation writing, etc.), they are harmful +for scientific analysis. The remainder of this section explains why, +and tries to convince the reader that although Command Line +Interfaces (CLIs) can be scary in the beginning, learning how +to use them is worth the work. + +SUBSECTION(«Why GUIs are harmful») +
    + +
- GUIs give people comfort by presenting choices.
- GUIs limit you by limiting choices.
- Science is a unique situation.
- As a scientist, you don't want to be influenced by the GUI choices
  of the programmer.
- As a scientist, you know which analyses you want to perform.
- In this situation, hand-holding GUIs hurt you.
- GUIs make you dependent on their style of hand-holding.
- GUIs change from version to version.
- GUI-based software is often commercial, so your data can be trapped
  if new versions cannot read the data files of old versions.
- GUIs make you less expert in your field.
- You will get bad habits:
 - GUIs make it easy to make bad choices (analysis, plotting, etc.).
 - GUIs make it easy to add unstructured data to your dataset
   (e.g. spreadsheets with formatting as data).
- Your colleagues do not use the GUI:
 - Even if you do not program yourself, you will generate data that
   is easy for programmers to work on.
+SUBSECTION(«Advantages of CLIs») +
    + +
  • You can do more without a GUI
  • + +
      +
    • As you work more with the CLI, you will see problems in a different + way. You will realize that you can do new things.
    • + +
    • Automation:
    • + +
        + +
      • Reduces your manual labor.
      • + +
      • Produces more consistent results.
      • + +
      • Allows you to work on bigger datasets.
      • + +
      +
    + +
  • You are faster without GUI
  • + +
      +
    • You will perform similar analyses over and over again. Using + GUI-based software requires many clicks. Each click is an opportunity + for a mistake.
    • + +
    • If you have to perform an analysis or make a conversion again, + you don't have to figure it out again if you have a little script.
    • + +
    • As you continue to work with the CLI, you will see problems + differently, and realize there are many ways.
    • + +
    + +
  • You will make fewer errors
  • + +
      + +
    • GUIs are easy, but there is no record of your clicks. It can be + difficult to detect a mis-click, so sometimes it is impossible + to tell what you did (for example, which options you selected for + an analysis).
    • + +
    • GUI-based software such as Excel can provide comfort, because + you "see" the data, but it hides the calculations, connections + between cells, and even which cells are entered data and which are + calculated. This has led to huge, world changing mistakes.
    • + +
    + +
  • You need the CLI
  • + +
      + +
    • Data is too big for GUIs.
    • + +
    • Almost all cutting-edge tools are CLI.
    • + +
    • You will probably need to convert formats to use new tools.
    • + +
    • You will need to do this over dozens or hundreds or thousands + of files to use the servers and the cluster.
    • + +
    + +
  • Using the CLI makes you a better scientist
  • + +
      +
    • CLI brings you closer to your data.
    • + +
    • CLI brings you closer to your analysis.
    • + +
    • CLI lets you use the most current tools.
    • + +
    • CLI makes your analyses more repeatable.
    • + +
    • CLI gives you more control and accountability in your analyses.
    • + +
    • CLI makes you more expert in your field.
    • + +
    • Do not fear the CLI.
    • + +
    • Learning the CLI takes work and time, but you will only have less time as you progress in your career.
    • +
    +
+») diff --git a/LVM.m4 b/LVM.m4 new file mode 100644 index 0000000..b02556a --- /dev/null +++ b/LVM.m4 @@ -0,0 +1,919 @@ +TITLE(« + + Who the heck is General Failure, and why is he reading my disk? -- Unknown + +», __file__) + +OVERVIEW(« + +The idea of Logical Volume Management is to decouple data and +storage. This offers great flexibility in managing storage and reduces +server downtimes because the storage may be replaced while file +systems are mounted read-write and applications are actively using +them. This chapter provides an introduction to the Linux block layer +and LVM. Subsequent sections cover selected device mapper targets. + +») + +SECTION(«The Linux Block Layer») + +

The main task of LVM is the management of block devices, so it is natural to start an introduction to LVM with a section on the Linux block layer, which is the central component in the Linux kernel for the handling of persistent storage devices. The mission of the block layer is to provide a uniform interface to different types of storage devices. The obvious in-kernel users of this interface are the file systems and the swap subsystem. Stacking device drivers like LVM, Bcache and MD also access block devices through this interface to create virtual block devices from other block devices. Some user space programs (fdisk, dd, mkfs, ...) also need to access block devices. The block layer allows them to perform their task in a well-defined and uniform manner through block-special device files.

+ +

The userspace programs and the in-kernel users interact with the block +layer by sending read or write requests. A bio is the central +data structure that carries such requests within the kernel. Bios +may contain an arbitrary amount of data. They are given to the block +layer to be queued for subsequent handling. Often a bio has to travel +through a stack of block device drivers where each driver modifies +the bio and sends it on to the next driver. Typically, only the last +driver in the stack corresponds to a hardware device.

+ +

Besides requests to read or write data blocks, there are various other bio requests that carry SCSI commands like FLUSH, FUA (Force Unit Access), TRIM and UNMAP. FLUSH and FUA ensure that certain data hits stable storage. FLUSH asks the device to write out the contents of its volatile write cache while a FUA request carries data that should be written directly to the device, bypassing all caches. UNMAP (SCSI) and TRIM (ATA) are only relevant to SSDs. They are a promise of the OS not to read the given range of blocks any more, so the device is free to discard the contents and return arbitrary data on the next read. This helps the device to level out the number of times the flash storage cells are overwritten (wear-leveling), which improves the durability of the device.

+ +

The first task of the block layer is to split incoming bios if necessary to make them conform to the size limit or the alignment requirements of the target device, and to batch and merge bios so that they can be submitted as a unit for performance reasons. The bios processed in this way then form an I/O request which is handed to an I/O scheduler (also known as an elevator).

+ +

At the time of writing (2018-11) there exist two different sets of schedulers: the traditional single-queue schedulers and the modern multi-queue schedulers, which are expected to replace the single-queue schedulers soon. The three single-queue schedulers, noop, deadline and cfq (completely fair queueing), were designed for rotating disks. They reorder requests with the aim of minimizing seek time. The newer multi-queue schedulers, mq-deadline, kyber, and bfq (budget fair queueing), aim to max out even the fastest devices. As implied by the name "multi-queue", they implement several request queues, the number of which depends on the hardware in use. This has become necessary because modern storage hardware allows multiple requests to be submitted in parallel from different CPUs. Moreover, with many CPUs the locking overhead required to put a request into a queue increases. Per-CPU queues allow for per-CPU locks, which decreases queue lock contention.
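As a small illustration of the scheduler interface, the sysfs files below can be used to inspect and change the active scheduler of a block device. This is only a sketch: the device name sda and the available scheduler names depend on the hardware and on the kernel configuration.

+		# Show the available schedulers; the active one is printed in brackets.
+		cat /sys/block/sda/queue/scheduler
+		# Switch to another scheduler offered by the running kernel (as root).
+		echo mq-deadline > /sys/block/sda/queue/scheduler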

+ +

We will take a look at some aspects of the Linux block layer and at the various I/O schedulers. An exercise on loop devices enables the reader to create block devices for testing. This will be handy in the subsequent sections on LVM-specific topics.
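For readers who have never worked with loop devices, a session along the following lines shows the general idea. It is only a sketch: the file name is arbitrary, the loop device name is assigned by losetup(8), and root privileges are required.

+		# Create a 1G file containing only zeroes and turn it into a loop device.
+		dd if=/dev/zero of=/tmp/loopfile bs=1M count=1024
+		losetup --find --show /tmp/loopfile    # prints the new device, e.g. /dev/loop0
+		# Create a file system on the loop device and mount it.
+		mkfs.xfs /dev/loop0
+		mount /dev/loop0 /mnt
+		# Undo everything.
+		umount /mnt
+		losetup -d /dev/loop0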

+ +EXERCISES() + +
    + +
  • Run find /dev -type b to get the list of all block + devices on your system. Explain which is which.
  • + +
  • Examine the files in /sys/block/sda, in + particular /sys/block/sda/stat. Search the web for + Documentation/block/stat.txt for the meaning of the + numbers shown. Then run iostat -xdh sda 1.
  • + +
  • Examine the files in /sys/block/sda/queue.
  • + +
  • Find out how to determine the size of a block device.
  • + +
  • Figure out a way to identify the name of all block devices which + correspond to SSDs (i.e., excluding any rotating disks).
  • + +
  • Run lsblk and discuss the output. Too easy? Run lsblk -o KNAME,PHY-SEC,MIN-IO,OPT-IO,LOG-SEC,RQ-SIZE,ROTA,SCHED
  • + +
  • What's the difference between a task scheduler and an I/O + scheduler?
  • + +
  • Why are I/O schedulers also called elevators?
  • + +
  • How can one find out which I/O schedulers are supported on a + system and which scheduler is active for a given block device?
  • + +
  • Is it possible (and safe) to change the I/O scheduler for a + block device while it is in use? If so, how can this be done?
  • + +
  • The loop device driver of the Linux kernel allows privileged + users to create a block device from a regular file stored on a file + system. The resulting block device is called a loop device. + Create a 1G large temporary file containing only zeroes. Run a suitable + losetup(8) command to create a loop device from the + file. Create an XFS file system on the loop device and mount it.
  • + +
+ +HOMEWORK(« + +
    +
  • Come up with three different use cases for loop devices.
  • + +
  • Given a block device node in /dev, how can one + tell that it is a loop device?
  • + +
  • Describe the connection between loop devices created by + losetup(8) and the loopback device used for network + connections from the machine to itself.
  • + +
+») + +define(«svg_disk», « + + + + + + +») + +SECTION(«Physical and Logical Volumes, Volume Groups») + +

Getting started with the Logical Volume Manager (LVM) requires getting used to a minimal set of vocabulary. This section introduces the words named in the title of the section, and a couple more. The basic concepts of LVM are then described in terms of these words.

+ +
+define(lvm_width», «300») +define(«lvm_height», «183») +define(«lvm_margin», «10») +define(«lvm_extent_size», «10») +define(«lvm_extent», « + +») +dnl $1: color, $2: x, $3: y, $4: number of extents +define(«lvm_extents», « + ifelse(«$4», «0», «», « + lvm_extent(«$1», «$2», «$3») + lvm_extents(«$1», eval($2 + lvm_extent_size() + lvm_margin()), + «$3», eval($4 - 1)) + ») +») +dnl $1: x, $2: y, $3: number of extents, $4: disk color, $5: extent color +define(«lvm_disk», « + ifelse(eval(«$3» > 3), «1», « + pushdef(«h», «eval(7 * lvm_extent_size())») + pushdef(«w», «eval(($3 + 1) * lvm_extent_size())») + », « + pushdef(«h», «eval(3 * lvm_extent_size() + lvm_margin())») + pushdef(«w», «eval($3 * lvm_extent_size() * 2)») + ») + svg_disk(«$1», «$2», «w()», «h()», «$4») + ifelse(eval(«$3» > 3), «1», « + pushdef(«n1», eval(«$3» / 2)) + pushdef(«n2», eval(«$3» - n1())) + lvm_extents(«$5», + eval(«$1» + (w() - (2 * n1() - 1) * lvm_extent_size()) / 2), + eval(«$2» + h() / 2 - lvm_extent_size()), «n1()») + lvm_extents(«$5», + eval(«$1» + (w() - (2 * n2() - 1) * lvm_extent_size()) / 2), + eval(«$2» + h() / 2 + 2 * lvm_extent_size()), «n2()») + popdef(«n1») + popdef(«n2») + », « + lvm_extents(«$5», + eval(«$1» + (w() - (2 * «$3» - 1) * lvm_extent_size()) / 2), + eval(«$2» + h() / 2), «$3») + ») + popdef(«w») + popdef(«h») +») + + + lvm_disk(«20», «20», «2», «#666», «yellow») + lvm_disk(«10», «90», «4», «#666», «yellow») + lvm_disk(«70», «55», «5», «#666», «yellow») + + lvm_disk(«190», «22», «7», «#66f», «orange») + lvm_disk(«220», «130», «1», «#66f», «orange») + +
+ +

A Physical Volume (PV, grey) is an arbitrary block device which contains a certain metadata header (also known as superblock) at the start. PVs can be partitions on a local hard disk or an SSD, a software or hardware raid, or a loop device. LVM does not care. The storage space on a physical volume is managed in units called Physical Extents (PEs, yellow). The default PE size is 4M.

+ +

A Volume Group (VG, green) is a non-empty set of PVs with +a name and a unique ID assigned to it. A PV can but doesn't need to +be assigned to a VG. If it is, the ID of the associated VG is stored +in the metadata header of the PV.

+ +

A Logical Volume (LV, blue) is a named block device which is provided by LVM. LVs are always associated with a VG and are stored on that VG's PVs. Since LVs are normal block devices, file systems of any type can be created on them, they can be used as swap storage, etc. The chunks of a LV are managed as Logical Extents (LEs, orange). Often the LE size equals the PE size. For each LV there is a mapping between the LEs of the LV and the PEs of the underlying PVs. The PEs may be spread across multiple PVs.

+ +

A VG can be extended by adding additional PVs to it, or reduced by removing unused devices, i.e., those with no PEs allocated on them. PEs may be moved from one PV to another while the LVs are active. LVs may be grown or shrunk. To grow a LV, there must be enough space left in the VG. Growing a LV does not magically grow the file system stored on it, however. To make use of the additional space, a second, file system specific step is needed to tell the file system that its underlying block device (the LV) has grown.
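As an illustration of this two-step procedure, the sketch below grows a hypothetical LV named tlv1 of the VG tvg by 1G and then enlarges the XFS file system mounted on /mnt. Names and sizes are examples only.

+		# Step 1: grow the LV (requires free extents in the VG).
+		lvextend -L +1G /dev/tvg/tlv1
+		# Step 2: tell the file system that its block device has grown.
+		xfs_growfs /mnt              # XFS takes the mount point
+		# resize2fs /dev/tvg/tlv1    # the EXT4 counterpart
+		# lvextend -r combines both steps in a single command.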

+ +

The exercises of this section illustrate the basic LVM concepts and the essential LVM commands. They ask the reader to create a VG whose PVs are loop devices. This VG is used as a starting point in subsequent sections.

+ +EXERCISES() + +
    + +
  • Create two 5G large loop devices /dev/loop1 + and /dev/loop2. Make them PVs by running + pvcreate. Create a VG tvg (test volume group) + from the two loop devices and two 3G large LVs named tlv1 + and tlv2 on it. Run the pvcreate, vgcreate, + and lvcreate commands with -v to activate + verbose output and try to understand each output line.
  • + +
  • Run pvs, vgs, lvs, lvdisplay, pvdisplay and examine + the output.
  • + +
  • Run lvdisplay -m to examine the mapping of logical + extents to PVs and physical extents.
  • + +
  • Run pvs --segments -o+lv_name,seg_start_pe,segtype + to see the map between physical extents and logical extents.
  • + +
+ +HOMEWORK(« + +In the above scenario (two LVs in a VG consisting of two PVs), how +can you tell whether both PVs are actually used? Remove the LVs +with lvremove. Recreate them, but this time use the +--stripes 2 option to lvcreate. Explain +what this option does and confirm with a suitable command. + +») + +SECTION(«Device Mapper and Device Mapper Targets») + +

The kernel part of the Logical Volume Manager (LVM) is called +device mapper (DM), which is a generic framework to map +one block device to another. Applications talk to the Device Mapper +via the libdevmapper library, which issues requests +to the /dev/mapper/control character device using the +ioctl(2) system call. The device mapper is also accessible +from scripts via the dmsetup(8) tool.

+ +

A DM target represents one particular mapping type for ranges of LEs. Several DM targets exist, each of which creates and maintains block devices with certain characteristics. In this section we take a look at the dmsetup tool and the relatively simple mirror target. Subsequent sections cover other targets in more detail.
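To see the connection between LVM and the device mapper, it is instructive to look at the DM devices which the LVM commands create behind the scenes. A short sketch (the device names shown depend on the local VG and LV names):

+		dmsetup ls                  # DM devices, e.g. tvg-tlv1 for LV tlv1 of VG tvg
+		dmsetup table               # start, length, target type and target arguments
+		dmsetup status              # per-device status information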

+ +EXERCISES() + +
    + +
  • Run dmsetup targets to list all targets supported + by the currently running kernel. Explain their purpose and typical + use cases.
  • + +
  • Starting with the tvg VG, remove tlv2. + Convince yourself by running vgs that tvg + is 10G large, with 3G being in use. Run pvmove + /dev/loop1 to move the used PEs of /dev/loop1 + to /dev/loop2. After the command completes, run + pvs again to see that /dev/loop1 has no + more PEs in use.
  • + +
  • Create a third 5G loop device /dev/loop3, make it a + PV and extend the VG with vgextend tvg /dev/loop3. Remove + tlv1. Now the LEs of tlv2 fit on any + of the three PVs. Come up with a command which moves them to + /dev/loop3.
  • + +
  • The first two loop devices are both unused. Remove them from + the VG with vgreduce -a. Why are they still listed in + the pvs output? What can be done about that?
  • + +
+ +HOMEWORK(« + +As advertised in the introduction, LVM allows the administrator to +replace the underlying storage of a file system online. This is done +by running a suitable pvmove(8) command to move all PEs of +one PV to different PVs in the same VG. + +
    + +
  • Explain the mapping type of dm-mirror.
  • + +
  • The traditional way to mirror the contents of two or more block + devices is software raid 1, also known as md raid1 ("md" + is short for multi-disk). Explain the difference between md raid1, + the dm-raid target which supports raid1 and other raid levels, and + the dm-mirror target.
  • + +
  • Guess how pvmove is implemented on top of + dm-mirror. Verify your guess by reading the "NOTES" section of the + pvmove(8) man page.
  • + +
+») + +SECTION(«LVM Snapshots») + +

LVM snapshots are based on the CoW optimization +strategy described earlier in the chapter on Unix +Concepts. Creating a snapshot means to create a CoW table of +the given size. Just before a LE of a snapshotted LV is about to be +written to, its contents are copied to a free slot in the CoW +table. This preserves an old version of the LV, the snapshot, which +can later be reconstructed by overlaying the CoW table atop the LV. + +

Snapshots can be taken from a LV which contains a mounted file system, +while applications are actively modifying files. Without coordination +between the file system and LVM, the file system most likely has memory +buffers scheduled for writeback. These outstanding writes did not make +it to the snapshot, so one can not expect the snapshot to contain a +consistent file system image. Instead, it is in a similar state as a +regular device after an unclean shutdown. This is not a problem for +XFS and EXT4, as both are journalling file systems, which +were designed with crash recovery in mind. At the next mount after a +crash, journalling file systems replay their journal, which results +in a consistent state. Note that this implies that even a read-only +mount of the snapshot device has to write to the device.
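The sketch below outlines how a snapshot is created and accessed; it anticipates parts of the exercises, and all names and sizes are examples. The -o nouuid mount option is only needed for XFS, because the snapshot carries the same UUID as its origin.

+		# Create a 1G snapshot of the LV tlv1 in the VG tvg.
+		lvcreate -s -L 1G -n snap_tlv1 /dev/tvg/tlv1
+		# Mount origin and snapshot side by side.
+		mount /dev/tvg/tlv1 /mnt/1
+		mount -o nouuid /dev/tvg/snap_tlv1 /mnt/2
+		# The Data% column shows how much of the CoW table is in use.
+		lvs tvg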

+ +EXERCISES() + +
    + +
  • In the test VG, create a 1G large snapshot named snap_tlv1 of the tlv1 LV by using the -s option to lvcreate(8). Predict how much free space is left in the VG. Confirm with vgs tvg.
  • + +
  • Create an EXT4 file system on tlv1 by running mkfs.ext4 /dev/tvg/tlv1. Guess how much of the snapshot space has been allocated by this operation. Check with lvs tvg/snap_tlv1.
  • + +
  • Remove the snapshot with lvremove and recreate it. Repeat the previous step, but this time run mkfs.xfs to create an XFS file system. Run lvs tvg/snap_tlv1 again and compare the used snapshot space to the EXT4 case. Explain the difference.
  • + +
  • Remove the snapshot and recreate it so that both tlv1 + and snap_tlv1 contain a valid XFS file system. Mount + the file systems on /mnt/1 and /mnt/2.
  • + +
  • Run dd if=/dev/zero of=/mnt/1/zero count=$((2 * 100 * + 1024)) to create a 100M large file on tlv1. Check + that /mnt/2 is still empty. Estimate how much of the + snapshot space is used and check again.
  • + +
  • Repeat the above dd command 5 times and run + lvs again. Explain why the used snapshot space did not + increase.
  • + +
  • It is possible to create snapshots of snapshots. This is + implemented by chaining together CoW tables. Describe the impact on + performance.
  • + +
  • Suppose a snapshot was created before significant modifications were made to the contents of the LV, for example an upgrade of a large software package. Assume that the user wishes to permanently return to the old version because the upgrade did not work out. In this scenario it is the snapshot which needs to be retained, rather than the original LV. In view of this scenario, guess what happens on the attempt to remove a LV which is being snapshotted. Unmount /mnt/1 and confirm by running lvremove tvg/tlv1.
  • + +
  • Come up with a suitable lvconvert command which + replaces the role of the LV and its snapshot. Explain why this solves + the "bad upgrade" problem outlined above.
  • + +
  • Explain what happens if the CoW table fills up. Confirm by + writing a file larger than the snapshot size.
  • + +
+ +SECTION(«Thin Provisioning») + +

The term "thin provisioning" is just a modern buzzword for +over-subscription. Both terms mean to give the appearance of having +more resources than are actually available. This is achieved by +on-demand allocation. The thin provisioning implementation of Linux +is implemented as a DM target called dm-thin. This code +first made its appearance in 2011 and was declared as stable two +years later. These days it should be safe for production use.

+ +

The general problem with thin provisioning is of course that bad things happen when the resources are exhausted because the demand has increased before new resources were added. For dm-thin this can happen when users write to their allotted space, causing dm-thin to attempt allocating a data block from a volume which is already full. This usually leads to severe data corruption because file systems are not really prepared to handle this error case and treat it as if the underlying block device had failed. dm-thin does nothing to prevent this, but one can configure a low watermark. When the number of free data blocks drops below the watermark, a so-called dm-event will be generated to notify the administrator.

+ +

One highlight of dm-thin is its efficient support for an arbitrary +depth of recursive snapshots, called dm-thin snapshots +in this document. With the traditional snapshot implementation, +recursive snapshots quickly become a performance issue as the depth +increases. With dm-thin one can have an arbitrary subset of all +snapshots active at any point in time, and there is no ordering +requirement on activating or removing them.

+ +

The block devices created by dm-thin always belong to a thin pool which ties together two LVs called the metadata LV and the data LV. The combined LV is called the thin pool LV. Setting up a VG for thin provisioning is done in two steps: First the standard LVs for data and metadata are created. Second, the two LVs are combined into a thin pool LV. The second step hides the two underlying LVs so that only the combined thin pool LV is visible afterwards. Thin provisioned LVs and dm-thin snapshots can then be created from the thin pool LV with a single command.
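The two steps translate into commands roughly like the following sketch; the names match the exercises below, but consult the "Thin Usage" section of lvmthin(7) for authoritative examples.

+		# Step 1: create the data LV and the metadata LV.
+		lvcreate -L 5G -n tdlv tvg
+		lvcreate -L 500M -n tmdlv tvg
+		# Step 2: combine them into a thin pool LV (the data LV keeps its name).
+		lvconvert --type thin-pool --poolmetadata tvg/tmdlv tvg/tdlv
+		# Thin LVs and dm-thin snapshots are then created from the pool.
+		lvcreate -V 10G -T tvg/tdlv -n oslv
+		lvcreate -s -n snap_oslv-0 tvg/oslv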

+ +

Another nice feature of dm-thin is external snapshots. An external snapshot is one where the origin for a thinly provisioned device is not a device of the pool. Arbitrary read-only block devices can be turned into writable devices by creating an external snapshot. Reads to an unprovisioned area of the snapshot will be passed through to the origin. Writes trigger the allocation of new blocks as usual with CoW. One use case for this is VM hosts which run their VMs on thinly-provisioned volumes but have the base image on some "master" device which is read-only and can hence be shared between all VMs.

+ +EXERCISES() + +

Starting with the tvg VG, create and test a thin pool LV +by performing the following steps. The "Thin Usage" section of +lvmthin(7) will be helpful. + +

    + +
  • Remove the tlv1 and tlv2 LVs.
  • + +
  • Create a 5G data LV named tdlv (thin data LV) and a 500M LV named tmdlv (thin metadata LV).
  • + +
  • Combine the two LVs into a thin pool with + lvconvert. Run lvs -a and explain the flags + listed below Attr.
  • + +
  • Create a 10G thin LV named oslv (over-subscribed + LV).
  • + +
  • Create an XFS file system on oslv and mount it on + /mnt.
  • + +
  • Run a loop of the form for ((i = 0; i < 50; i++)); do ... ; done so that each iteration creates a 50M file named file-$i and a snapshot named snap_oslv-$i of oslv.
  • + +
  • Activate an arbitrary snapshot with lvchange -K and + try to mount it. Explain what the error message means. Then read the + "XFS on snapshots" section of lvmthin(7).
  • + +
  • Check the available space of the data LV with lvs + -a. Mount one snapshot (specifying -o nouuid) + and run lvs -a again. Why did the free space decrease + although no new files were written?
  • + +
  • Mount four different snapshots and check that they contain the + expected files.
  • + +
  • Remove all snapshots. Guess what lvs -a and df -h /mnt report. Then run the commands to confirm. Guess what happens if you try to create another 3G file? Confirm your guess, then read the section on "Data space exhaustion" of lvmthin(7).
  • + +
+ +HOMEWORK(« + +When a thin pool provisions a new data block for a thin LV, the new +block is first overwritten with zeros by default. Discuss why this +is done, its impact on performance and security, and conclude whether +or not it is a good idea to turn off the zeroing. + +») + +SECTION(«Bcache, dm-cache and dm-writecache») + +

All three implementations named in the title of this section are Linux block layer caches. They combine two different block devices to form a hybrid block device which dynamically caches and migrates data between the two devices with the aim of improving performance. One device, the backing device, is expected to be large and slow while the other one, the cache device, is expected to be small and fast.

+ +
+define(«bch_width», «300») +define(«bch_height», «130») +define(«bch_margin», «10») +define(«bch_rraid_width», «eval((bch_width() - 4 * bch_margin()) * 4 / 5)») +define(«bch_raidbox_height», «eval(bch_height() - 2 * bch_margin())») +define(«bch_nraid_width», «eval(bch_rraid_width() / 4)») +define(«bch_rdisk_width», «eval((bch_width() - 3 * bch_margin()) * 18 / 100)») +define(«bch_rdisk_height», «eval((bch_height() - 4 * bch_margin()) / 3)») +define(«bch_ndisk_width», «eval(bch_rdisk_width() / 2)») +define(«bch_ndisk_height», «eval(bch_raidbox_height() - 5 * bch_margin())») +define(«bch_rdisk», «svg_disk(«$1», «$2», + «bch_rdisk_width()», «bch_rdisk_height()», «#666»)») +define(«bch_ndisk», «svg_disk(«$1», «$2», + «bch_ndisk_width()», «bch_ndisk_height()», «#66f»)») +define(«bch_5rdisk», « + bch_rdisk(«$1», «$2») + bch_rdisk(«eval($1 + bch_margin())», + «eval($2 + bch_margin())») + bch_rdisk(«eval($1 + 2 * bch_margin())», + «eval($2 + 2 * bch_margin())») + bch_rdisk(«eval($1 + 3 * bch_margin())», + «eval($2 + 3 * bch_margin())») + bch_rdisk(«eval($1 + 4 * bch_margin())», + «eval($2 + 4 * bch_margin())») + +») +define(«bch_rraid», « + + bch_5rdisk(«eval($1 + bch_margin())», + «eval($2 + 2 * bch_margin())») + bch_5rdisk(«eval($1 + 2 * bch_rdisk_width() + bch_margin())», + «eval($2 + 2 * bch_margin())») +») +define(«bch_nraid», « + + bch_ndisk(eval($1 + bch_margin()), + eval($2 + 2 * bch_margin())) + bch_ndisk(eval($1 + 2 * bch_margin()), + eval($2 + 3 * bch_margin())) +») + + + + bch_nraid(«bch_margin()», «bch_margin()») + bch_rraid(«eval(2 * bch_margin() + bch_nraid_width())», «bch_margin()») + +
+ +

The most simple setup consists of a single rotating disk and one SSD. +The setup shown in the diagram at the left is realistic for a large +server with redundant storage. In this setup the hybrid device +(yellow) combines a raid6 array (green) consisting of many rotating +disks (grey) with a two-disk raid1 array (orange) stored on fast +NVMe devices (blue). In the simple setup it is always a win when +I/O is performed from/to the SSD instead of the rotating disk. In +the server setup, however, it depends on the workload which device +is faster. Given enough rotating disks and a streaming I/O workload, +the raid6 outperforms the raid1 because all disks can read or write +at full speed.

+ +

Since block layer caches hook into the Linux block API described earlier, the hybrid block devices +they provide can be used like any other block device. In particular, +the hybrid devices are file system agnostic, meaning that +any file system can be created on them. In what follows we briefly +describe the differences between the three block layer caches and +conclude with the pros and cons of each.

+ +

Bcache is a stand-alone stacking device driver which was +included in the Linux kernel in 2013. According to the bcache home page, it +is "done and stable". dm-cache and dm-writecache are device mapper +targets included in 2013 and 2018, respectively, which are both marked +as experimental. In contrast to dm-cache, dm-writecache only caches +writes while reads are supposed to be cached in RAM. It has been +designed for programs like databases which need low commit latency. +Both bcache and dm-cache can operate in writeback or writethrough +mode while dm-writecache always operates in writeback mode.
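For orientation, a bcache device is assembled with the make-bcache tool from the bcache-tools package. The sketch below is not a complete recipe; the device names are examples, and on most systems udev performs the registration steps automatically.

+		make-bcache -B /dev/loop0          # format the backing device
+		make-bcache -C /dev/loop1          # format the cache device, prints a cache set UUID
+		echo /dev/loop0 > /sys/fs/bcache/register
+		echo /dev/loop1 > /sys/fs/bcache/register
+		echo "$CSET_UUID" > /sys/block/bcache0/bcache/attach   # UUID printed by make-bcache -C
+		# The hybrid device then shows up as /dev/bcache0.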

+ +

The DM-based caches are designed to leave the decision as to what +data to migrate (and when) to user space while bcache has this policy +built-in. However, at this point only the Stochastic Multiqueue + (smq) policy for dm-cache exists, plus a second policy which +is only useful for decommissioning the cache device. There are no +tunables for dm-cache while all the bells and whistles of bcache can +be configured through sysfs files. Another difference is that bcache +detects sequential I/O and separates it from random I/O so that large +streaming reads and writes bypass the cache and don't push cached +randomly accessed data out of the cache.

+ +

bcache is the clear winner of this comparison: it is stable, configurable, and performs better, at least on the server setup described above, since it separates random and sequential I/O. The only advantage of dm-cache is its flexibility because cache policies can be switched. But even this remains a theoretical advantage as long as only a single policy for dm-cache exists.

+ +EXERCISES() + +
    + +
  • Recall the concepts of writeback and writethrough and explain + why writeback is faster and writethrough is safer.
  • + +
  • Explain how the writearound mode of bcache works and + when it should be used.
  • + +
  • Setup a bcache device from two loop devices.
  • + +
  • Create a file system on a bcache device and mount it. Detach the cache device while the file system is mounted.
  • + +
  • Setup a dm-cache device from two loop devices.
  • + +
  • Setup a thin pool where the data LV is a dm-cache device.
  • + +
  • Explain the point of dm-cache's passthrough mode.
  • + +
+ +HOMEWORK(« + +Explain why small writes to a file system which is stored on a +parity raid result in read-modify-write (RMW) updates. Explain why +RMW updates are particularly expensive and how raid implementations +and block layer caches try to avoid them. + +») + +HOMEWORK(« + +Recall the concepts of writeback and writethrough. Describe what +each mode means for a hardware device and for a bcache/dm-cache +device. Explain why writeback is faster and writethrough is safer. + +») + +HOMEWORK(« + +TRIM and UNMAP are special commands in the ATA/SCSI command sets +which inform an SSD that certain data blocks are no longer in use, +allowing the SSD to re-use these blocks to increase performance and +to reduce wear. Subsequent reads from the trimmed data blocks will +not return any meaningful data. For example, the mkfs +commands sends this command to discard all blocks of the device. +Discuss the implications when mkfs. is run on a device +provided by bcache or dm-cache. + +») + +SECTION(«The dm-crypt Target») + +

This device mapper target provides encryption of arbitrary block +devices by employing the primitives of the crypto API of the Linux +kernel. This API provides a uniform interface to a large number of +cipher algorithms which have been implemented with performance and +security in mind.

+ +

The cipher algorithm of choice for the encryption of block devices +is the Advanced Encryption Standard (AES), also known +as Rijndael, named after the two Belgian cryptographers +Rijmen and Daemen who proposed the algorithm in 1999. AES is a +symmetric block cipher. That is, a transformation which operates +on fixed-length blocks and which is determined by a single key for both +encryption and decryption. The underlying algorithm is fairly simple, +which makes AES perform well in both hardware and software. Also +the key setup time and the memory requirements are excellent. Modern +processors of all manufacturers include instructions to perform AES +operations in hardware, improving speed and security.

+ +

According to the Snowden documents, the NSA has been doing research +on breaking AES for a long time without being able to come up with +a practical attack for 256 bit keys. Successful attacks invariably +target the key management software instead, which is often implemented +poorly, trading security for user-friendliness, for example by +storing passwords weakly encrypted, or by providing a "feature" +which can decrypt the device without knowing the password.

+ +

The exercises of this section ask the reader to encrypt a loop device with AES without relying on any third party key management software.
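To give an idea of what the pieces of the exercises look like when put together, here is a sketch of one possible solution. Deriving the key by hashing the passphrase with sha256sum is just one way to obtain the required 64 hex digits; it is an assumption of this sketch, not part of the dm-crypt interface.

+		stty -echo; read -r passphrase; stty echo
+		key=$(printf '%s' "$passphrase" | sha256sum | cut -d' ' -f1)  # 256 bit key as 64 hex digits
+		sectors=$(blockdev --getsz /dev/loop0)                        # device size in 512-byte sectors
+		echo "0 $sectors crypt aes $key 0 /dev/loop0 0" | dmsetup create cryptdev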

+ +EXERCISES() +
    +
  • Discuss the message of this xkcd comic.
  • + +
  • How can a hardware implementation of an algorithm like AES + improve security? After all, it is the same algorithm that is + implemented.
  • + +
  • What's the point of the rstream.c + program below which writes random data to stdout? Doesn't + cat /dev/urandom do the same?
  • + +
  • Compile and run rstream.c to create + a 10G local file and create the loop device /dev/loop0 + from the file.
  • + +
  • A table for the dmsetup(8) command is + a single line of the form start_sector num_sectors target_type + target_args. Determine the correct values for the first three + arguments to encrypt /dev/loop0.
  • + +
  • The target_args for the dm-crypt target are + of the form cipher key iv_offset device offset. To + encrypt /dev/loop0 with AES-256, cipher + is aes, device is /dev/loop0 and both + offsets are zero. Come up with an idea to create a 256 bit key from + a passphrase.
  • + +
  • The create subcommand of dmsetup(8) + creates a device from the given table. Run a command of + the form echo "$table" | dmsetup create cryptdev + to create the encrypted device /dev/mapper/cryptdev + from the loop device.
  • + +
  • Create a file system on /dev/mapper/cryptdev, + mount it and create the file passphrase containing + the string "super-secret" on this file system.
  • + +
  • Unmount the cryptdev device and run dmsetup remove cryptdev. Run strings on the loop device and on the underlying file to see if it contains the string "super-secret" or passphrase.
  • + +
  • Re-create the cryptdev device, but this time use + a different (hence invalid) key. Guess what happens and confirm.
  • + +
  • Write a script which disables echoing (stty -echo), + reads a passphrase from stdin and combines the above steps to create + and mount an encrypted device.
  • + +
+ +HOMEWORK(« + +Why is it a good idea to overwrite a block device with random data +before it is encrypted? + +») + +HOMEWORK(« + +The dm-crypt target encrypts whole block devices. An alternative is +to encrypt on the file system level. That is, each file is encrypted +separately. Discuss the pros and cons of both approaches. + +») + +SUPPLEMENTS() + +SUBSECTION(«Random stream») + +
+	
+		/* Link with -lcrypto */
+		#include <openssl/rand.h>
+		#include <stdio.h>
+		#include <unistd.h>
+		#include <stdlib.h>
+
+		int main(int argc, char **argv)
+		{
+			unsigned char buf[1024 * 1024];
+
+			for (;;) {
+				int ret = RAND_bytes(buf, sizeof(buf));
+
+				if (ret <= 0) {
+					fprintf(stderr, "RAND_bytes() error\n");
+					exit(EXIT_FAILURE);
+				}
+				ret = write(STDOUT_FILENO, buf, sizeof(buf));
+				if (ret < 0) {
+					perror("write");
+					exit(EXIT_FAILURE);
+				}
+			}
+			return 0;
+		}
+	
+
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..d210e2b --- /dev/null +++ b/Makefile @@ -0,0 +1,64 @@ +I := include +B := build + +MACROS := $(I)/m4/aple.m4 +M4_ARGS := $(MACROS) +ifeq ($(findstring pub, $(MAKECMDGOALS)),) + m4 := $(wildcard *.m4) + M4_ARGS += -D PUBLIC=false + DEST := $(B)/internal +else + m4 := Introduction.m4 Unix_Concepts.m4 Networking.m4 LVM.m4 \ + Filesystems.m4 OS-Level_Virtualization.m4 + M4_ARGS += -D PUBLIC=true + DEST := $(B)/public +endif + +CSS := aple.css + +html := $(m4:.m4=.html) +imgs := $(wildcard $(I)/imgs/*.svg) +files := $(html) $(CSS) index.html $(notdir $(imgs)) aple.ico +all: $(addprefix $(DEST)/, $(files)) +pub: all + +LN_CMD = ln -f $< $@ + +$(DEST): + mkdir -p $@ + +# m4 -> html + +MD_CMD = m4 $(M4_ARGS) $< | markdown -f tables,links > $@ + +$(DEST)/Bash.html: Bash.m4 $(MACROS) | $(DEST) + $(MD_CMD) +$(DEST)/Command_Line_Utilities.html: Command_Line_Utilities.m4 $(MACROS) | $(DEST) + $(MD_CMD) +$(DEST)/Git.html: Git.m4 $(MACROS) | $(DEST) + $(MD_CMD) +$(DEST)/Gridengine.html: Gridengine.m4 $(MACROS) | $(DEST) + $(MD_CMD) + +$(DEST)/%.html: %.m4 $(MACROS) | $(DEST) + m4 $(M4_ARGS) $< > $@ + +# svg +$(DEST)/%.svg: $(I)/imgs/%.svg | $(DEST) + $(LN_CMD) + +$(DEST)/%.ico: $(I)/imgs/%.svg | $(DEST) + convert -level 0%,100%,0.1 $< $@ + +# style sheet +$(DEST)/$(CSS): $(I)/css/$(CSS) | $(DEST) + $(LN_CMD) + +# index.html +$(DEST)/index.html: $(DEST)/Introduction.html + $(LN_CMD) + +.PHONY: clean +clean: + rm -rf $(B) +-include Makefile.local diff --git a/Networking.m4 b/Networking.m4 new file mode 100644 index 0000000..b469442 --- /dev/null +++ b/Networking.m4 @@ -0,0 +1,629 @@ +TITLE(« + +Network down, IP packets delivered via UPS. -- BOFH excuse #427 + +», __file__) + +OVERVIEW(« + +Networking is a complex and diverse area of computer science. This page +can only scratch the surface of some essential networking concepts, +aiming to convey enough background knowledge to understand more +specific and thorough articles on the subject matter and to encourage +the reader to explore the vast freely available literature. We cover +the four layers of the TCP/IP interconnection model in some detail +and look at a small subset of networking tools, including SSH. The +chapter concludes with a short overview of the Linux-specific Netlink +subsystem. + +») + +SECTION(«Network Layers») + +
+define(«nl_width», «260») +define(«nl_height», «200») +define(«nl_box_width», «100») +define(«nl_text_offset», «110») +define(«nl_box_height», «eval((nl_height() - 10) / 5)») +define(«nl_layer_width», «eval(nl_box_width() / 4)») +define(«nl_font_size», «15») +dnl $1: layer (link/internet/transport/application) +dnl $2: box number (0-4), $3: row-span, $4: column-span, $5: color +define(«nl_box», « + +») +dnl $1: box number (see nl_box()), $2: text +define(«nl_text», « + + $2 + +») + + nl_box(«link», «0», «1», «#a22») + nl_box(«link», «1», «3», «#7e5») + nl_box(«link», «4», «1», «blue») + nl_box(«internet», «1», «1», «yellow») + nl_box(«internet», «2», «2», «#7e5») + nl_box(«transport», «2», «1», «orange») + nl_box(«transport», «3», «1», «#7e5») + nl_box(«application», «3», «1», «#7e5») + nl_text(«0», «Frame Header») + nl_text(«1», «IP Header») + nl_text(«2», «TCP/UDP Header») + nl_text(«3», «Data») + nl_text(«4», «Frame Footer») + +
+ +

The Open Systems Interconnection (OSI) model describes +network communication by subdividing the data flow into +abstraction layers. This model was published as an ISO standard +in 1984 and comprises seven independent layers. A similar model with +only four layers, known as the TCP/IP interconnection model, +was proposed in RFC 1122 (1989). The TCP/IP model does not consider +physical specifications, so it has no counterpart to the physical +layer of the OSI model. Moreover, the three top layers in the +OSI model are not distinguished in the TCP/IP model.

+ +

The four layers of the TCP/IP model (link, internet, +transport, and application) are illustrated in the +diagram on the left. The link layer receives the full ethernet frame +(left column). It reads and interprets the frame header (red) and +footer (blue), and regards the remaining part as data (green), to be +passed uninterpreted to the next layer. The internet layer (second +column) expects an IP packet and interprets the first part of the data +as the IP header (yellow). It hands off the rest as a TCP/UDP packet to +the transport layer (third column) which in turn reads and strips off +its header (orange). The application layer only sees the green part in +the fourth column. Each layer is discussed in a dedicated section.

+ +EXERCISES() + +
    +
  • RFC is short for Request for Comments. Make yourself + familiar with this concept.
  • + +
  • Search the web for "RFC 822" to get an idea how an RFC looks + like.
  • + +
  • Discuss the pros and cons of many abstraction layers.
  • +
+ +SECTION(«Link Layer») + +

The local network connection of a host is called its link. +The link layer is responsible for transmitting packets +between two hosts on the same link, that is, between directly connected +nodes. The link layer includes the protocols which maintain link states +such as the Address Resolution Protocol (ARP). Several + link types exist, the ubiquitous ethernet being +the only one to be discussed here. For ethernet links, the protocol +is specified in terms of the media access control (MAC) +addresses of ethernet frames.

+ +SUBSECTION(«Ethernet Bridging») + +

An ethernet bridge connects two or more networks by relaying +ethernet frames between the participating devices. This is described +in an official standard, the first revision of which was published in +1990. This standard can be implemented within a dedicated hardware +device, for example a network switch, or in software as +part of the operating system. Many soft- and hardware implementations +exist, which are compatible to each other as they all implement the +same protocol. Since ethernet bridges operate on the link layer, they +are transparent to higher level protocols like IP.

+ +

At the core of each bridge implementation there is the forwarding +database whose entries are indexed by the MAC addresses that have +recently been seen. Each time the bridge receives an ethernet frame, +the destination MAC address is looked up in the database to determine +the device to which the frame should be relayed. If no entry exists, +the frame is sent to all devices except the one it came +from, with the expectation that all devices but one will ignore the +frame. This is called flooding. From the source address +of the (single) reply a new database entry is created. This prevents +further flooding. Entries are removed from the database by aging: If +no frames have been received from a MAC address for the duration of a +time interval called aging time, the entry is removed from +the database.

+ +

The Linux ethernet bridge implementation dates back to 1999. Two +different tools are available to create and configure bridges: +brctl(8) and bridge(8). The exercises of this +section aim to get the reader started with both tools.

+ +SUBSECTION(«Virtual Ethernet Interfaces») + +

A bridge can accommodate physical devices like eth0 as well as virtual devices. On Linux systems the common approach to equip virtual machines with network interfaces employs the virtual ethernet (veth) device driver. This driver provides virtual pairs of devices where each pair represents an ethernet tunnel. Ethernet frames received by one end appear on its peer. To set up the network interface for a virtual machine, one end of the pair is added to a bridge on the host system while the other end represents the ethernet device of the virtual machine.
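A bridge with one end of a veth pair attached can be set up with a handful of ip(8) and bridge(8) commands. The sketch below uses arbitrary interface names and requires root privileges.

+		ip link add br0 type bridge
+		ip link add v1 type veth peer name v2
+		ip link set v1 master br0        # attach one end of the pair to the bridge
+		ip link set br0 up
+		ip link set v1 up
+		ip link set v2 up
+		bridge link                      # list bridge ports
+		bridge fdb show br br0           # dump the forwarding database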

+ +EXERCISES() +
    +
  • Determine the MAC address of the first ethernet device of your + computer.
  • + +
  • How many times faster has ethernet become since its first protocol + version that supported a transfer rate of 2.94 Mbit/s in 1973?
  • + +
  • Explain why bridges can not be detected by tools like + traceroute(1) or tracepath(8).
  • + +
  • Hardware ethernet bridges are called switches because they + relay ethernet frames by using a technique called packet + switching. Define this term.
  • + +
  • The arp(8) command prints the kernel's network + neighbour cache. Explain the purpose of this cache and the difference + to the forwarding database of an ethernet bridge.
  • + +
  • Network devices can be set into promiscuous + mode. Explain what this means, why interfaces which belong + to an ethernet bridge need to be set into promiscuous mode, + and the consequences of this fact.
  • + +
  • On your local computer, shut down the eth0 + interface, create a bridge and add eth0 to the bridge. + Then configure the bridge device in the same way the physical interface + was configured before. Run brctl showmacs to see MAC + addresses and the aging timer.
  • + +
  • Create a virtual ethernet pair by running ip link add v1 + type veth peer name v2 and bring up the two interfaces with + ip link set up v1 and similar for v2. Add + the v1 end to the bridge. Configure an IP address on + the v2 end of the pair (ip addr add 192.168.42.42 + dev v2). Add an entry for the IP routing table with + ip route add 192.168.42.42/32 dev v2. Start the nc(1) + tool in listening mode and send IP traffic through the bridge + by starting nc(1) a second time to connect to + 192.168.42.42.
  • + +
+ +HOMEWORK(« +
    + +
  • Recall how the spanning tree algorithm works. Explain + how the spanning tree algorithm is employed in the spanning + tree protocol (STP). Name two reasons for activating STP in a + large network.
  • + +
  • The rapid spanning tree protocol (RSTP) is the + successor of the traditional STP. Explain the difference between the + two protocols.
  • + +
  • In each bridged network, there is one bridge which + plays a special role: the so-called root bridge. + Explain the purpose of the root bridge and how it is determined + among the bridges of the network.
  • + +
  • Linux offers two different tools to configure ethernet bridges: + brctl(8) and bridge(8). Compare the + feature sets of these tools.
  • + +
+») + + +SECTION(«Internet Layer») + +

These days the term "internet" has acquired a rather broad meaning +in that it refers to all kind of network services. However, in +the context of the TCP/IP interconnection model, the internet +layer is named aptly because its purpose is to send packets +across different networks, thereby enabling inter-networking. More +precisely, packets are routed from the source network to +the destination network, where both networks are identified by +IP interface addresses. Although both the prevalent IPv4 and the +next-generation IPv6 variant are being deployed actively worldwide, +we shall only discuss IPv4 here.

+ +

The first part of each IP packet is the IP header, which is usually 20 bytes long. Besides the source and destination addresses, it contains an 8-bit protocol number which refers to the data portion of the packet.

+ +

IP only provides an unreliable datagram transmission facility, which means that packets may be lost, arrive multiple times, or out of order. Moreover, packets can be fragmented on the way and reassembled at the receiving end.

+ +EXERCISES() + +
    +
  • How many different IPv4 addresses exist?
  • + +
  • Visit this page which + claims to show the IP address of your computer. Check if the result + was correct by running ip addr show. Run host + a.b.c.d for the IP addresses and discuss the result.
  • + +
  • What is the difference between the maximum transmission + unit (MTU) and the path MTU?
  • + +
  • Describe the purpose of the Internet Control Message Protocol + (ICMP) and its relationship to IP.
  • + +
  • Byte 9 of the header of an IP packet is the so-called + time to live (TTL) field, which is initially set to 64 by + the sender. Explain the purpose of this field.
  • + +
  • Explain the connection between the TTL and the Internet + Control Message Protocol (ICMP).
  • + +
  • What is a netmask in an IPv4 network? What is the purpose of the + netmask? Why is the network part of an IP address also called + routing prefix?
  • + +
  • On any host, run ifconfig and ip addr + show. Both commands print the netmask of each network, but + in different ways. Explain which part of the output of the + ip command contains the netmask.
  • + +
+ +HOMEWORK(« +Discuss the security implications of network services which are based +on MAC addresses or IP addresses alone. +», « +Both the IP address and the MAC address are trivial to fake. So they +should never be used to authenticate a user or a device on a network +to which potential attackers have physical access, i.e., untrusted +devices can be connected. +») + +HOMEWORK(« + +Illustrate how network address translation (NAT) works +on the basis of a web search initiated from a desktop computer in a +local network and discuss the implications that NAT has on privacy. + +», « + +

The desktop is configured to route packets which are not destined +for the local network through a dedicated machine, called the +router. In particular, all internet traffic is sent to the router. +The router has two IP addresses: one address in the local network +and a public NAT address. As traffic passes from the desktop through +the router to the web server in the internet, the source address of +each IP packet (the local address of the desktop) is changed on the +fly to the public NAT address of the router. The router tracks each +active connection. When a reply arrives at the router, it uses the +connection tracking data stored during the outbound phase to determine +the address in the local network to which to forward the reply. This +time it overwrites the destination address of the IP packet with the +local address of the desktop.

+ +

NAT can be seen as providing a kind of privacy mechanism because +machines on the internet cannot monitor which hosts are sending and +receiving traffic. They only see the NAT address. NAT has also +downsides though: Pinpointing the source of a problem becomes harder, +and encryption becomes more difficult. For example you can not encrypt +the IP address because the router must be able to change it.

+ +») + +HOMEWORK(« +Run tracepath wikipedia.org. Explain how this command +works and how it can be used to identify networking problems. +») + +SECTION(«Transport Layer») + +

The protocols of the transport layer provide message transfer services which are on one hand independent of the underlying network type, and on the other hand independent of the application. Different network services running on the same host are distinguished by port numbers, which are 16 bit identifiers. Several well known port numbers are associated with specific applications. The two dominant transport layer protocols on top of IP, TCP and UDP, are discussed in the following subsections.

+ +SUBSECTION(«The User Datagram Protocol») + +

The User Datagram Protocol (UDP) is the simplest transport-layer protocol, built as a thin layer on top of IP. For this reason, it offers only the same best-effort service as IP itself. For example, there is no detection of duplicate or reordered packets, and no protection against packet loss or network congestion. However, UDP generates checksums to catch transmission errors. Since UDP is a connectionless protocol, only minimal internal state about the connection is maintained. This makes UDP suitable for applications which need to avoid the overhead of setting up a TCP connection, or in situations where on-time arrival is more important than reliability.

+ +SUBSECTION(«The Transmission Control Protocol») + +

The Transmission Control Protocol (TCP) provides reliable, ordered delivery of a stream and a classic window-based congestion control. In contrast to UDP, TCP provides a stream which is independent of any packet boundaries. TCP is used extensively by many applications: besides HTTP (the Hypertext Transfer Protocol), also FTP (the File Transfer Protocol), SMTP (Simple Mail Transfer Protocol) and SSH (Secure Shell) sit on top of TCP.
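The client/server nature of TCP can be experienced directly with netcat, as in the following sketch. The exact flags differ slightly between netcat variants; the port number is arbitrary.

+		nc -l 12345               # terminal 1: listen on TCP port 12345
+		nc localhost 12345        # terminal 2: connect; typed lines appear on the other side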

+ +EXERCISES() + +
    +
  • Check /etc/services and find the TCP port + numbers for http (web), ssh and smtp (email). + +
  • Run ls /proc/sys/net/ipv4/udp* and ls + /proc/sys/net/ipv4/tcp* to see the available UDP and TCP + parameter settings, and to compare the complexity of UDP and TCP.
  • + +
  • Run netstat -pant to see active TCP sockets.
  • + +
  • Run nmap localhost to determine the listening + TCP sockets.
  • + +
  • Run netcat in TCP listening mode: nc -l $((19853 + UID)). Invite your neighbour to chat with you by connecting to your netcat process.
  • + +
  • Read section 3.1 of RFC 793 (Transmission Control Protocol, + 1981) to get an idea of the format of a TCP header, particularly the + control bits called ACK, RST, SYN and FIN.
  • + +
  • The name "SYN" for one of the control bits of the TCP header + stands for synchronize. What is being synchronized when + this bit is set? And why does it need to be synchronous in the first + place?
  • + +
  • Make yourself familiar with the 3-way TCP handshake also described + in RFC 793. Why is it called a 3-way handshake?
  • + +
  • Run ip tcpmetrics to see recent TCP peers and + discuss the output.
  • +
+ +HOMEWORK(« +
    + +
  • UDP is said to be datagram-oriented while TCP is + stream-oriented. Define both terms and explain the implications + for application writers.
  • + +
  • Explain how TCP achieves its goal of controlling the transmission + speed.
  • + +
  • Explain how the "SYN flooding" denial-of-service attack works and + how SYN cookies mitigate this attack.
  • + +
  • Explain the idea behind TFO (TCP fast open).
  • + +
  • In contrast to TCP, UDP is a connectionless protocol. In + particular, there is no handshake necessary to establish a + connection. Describe the pros and cons of this fact.
  • + +
  • Explain, in no more than two sentences, the idea behind a port + scan.
  • + +
  • What's a half-open TCP connection? What's a half-open TCP + port scan? Explain why half-open TCP port scans are reasonably + stealthy.
  • + +
  • Express your opinion on whether performing an unsolicited TCP + port scan should be considered a criminal act.
  • +
+») + +SECTION(«Application Layer») + +

Application layer protocols define how the server side of a network service communicates with clients that connect to the server on a specific TCP or UDP port. Services are often associated with port numbers which can be registered at the Internet Assigned Numbers Authority (IANA).

+ +

Examples for application layer protocols which are employed on top of +TCP are the Hypertext Transfer Protocol (HTTP, port 80) +and the Secure Shell Protocol (SSH, port 22). On top of +UDP sit the Domain Name System (DNS, port 53), the +Dynamic Host Configuration Protocol (DHCP, ports 67 and 68) +and the Network Time Protocol (NTP, port 123).

+ +

We won't discuss any specific application layer protocols here. Instead, +we look at some client programs.

+ +SUBSECTION(«The Name Service Switch») + +

Every Unix system needs a couple of (usually small) system databases +for proper operation. Besides the user database, there are other +databases for Unix group membership, the known hosts, network +protocols, and more. Traditionally, there was only a single source for +this information in the form of a configuration file per database, for +example /etc/hosts for the hosts database. The format of +each database file is described in the POSIX standard and in section +5 of the user manuals. This approach works well if the databases +and the number of hosts which need to share the same databases are +small. Larger organizations, however, have a need to maintain this +information centrally by means of some network service. The +Lightweight Directory Access Protocol (LDAP) and the +Domain Name System (DNS) are popular choices for the user +and the host/domain databases. Often the entries of the centralized +network database have to be merged with the entries of the local file +in /etc. This calls for a flexible method which lets the +administrator specify the sources of information and the search order. +Sun Microsystems came up with a clean solution to this problem named + Name Service Switch (NSS) for the Solaris operating system. +This solution was ported to most other Unix operating systems. The +implementation used on GNU/Linux systems is part of the GNU +C Library (glibc). The central configuration file for NSS is + /etc/nsswitch.conf.
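The getent(1) utility queries these databases through the NSS, so its output reflects whatever sources and search order are configured in /etc/nsswitch.conf. A couple of examples:

+		getent hosts wikipedia.org       # follows the "hosts" line (files, dns, ...)
+		getent passwd "$LOGNAME"         # follows the "passwd" line (files, ldap, ...)
+		getent services ssh              # another small system database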

+ +SUBSECTION(«Advanced SSH Features») + +

SSH, the secure shell, is a popular client/server software package for logging into a remote machine. The name is a little misleading, though. For one, SSH is not a shell; it merely provides a method to run a shell. Second, it can do much more than just log in and start the shell. It features a secure encrypted communication channel between two hosts, and this channel can be utilized in interesting ways on both ends. In the exercises we look at TCP port forwarding, some useful configuration options, and public key authentication.
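As a taste of what the exercises cover, the sketch below shows local port forwarding, key-based authentication and a client configuration entry. All host names are placeholders.

+		# Forward local port 12345 to port 22 of an internal host, via a gateway.
+		ssh -L 12345:internal.example.org:22 gateway.example.org
+		# Create a key pair and install the public part on the remote host.
+		ssh-keygen -t ed25519
+		ssh-copy-id user@gateway.example.org
+		# ~/.ssh/config entry so that "ssh internal" works:
+		#   Host internal
+		#           Hostname internal.example.org
+		#           ProxyJump gateway.example.org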

+ +EXERCISES() + +
    + +
  • Inspect /etc/resolv.conf to get the IP address of your nameserver(s). Then run dig @$IP $DOMAIN MX where $IP is the nameserver IP address, and $DOMAIN is the domain of your email address, e.g. tuebingen.mpg.de. Determine the hostname of the mail server from the output and run nc $MAILHOST 25 to send a mail to yourself. Hint: HELO $MAILHOST, mail from: <$LOGNAME@$DOMAIN>, rcpt to: <$LOGNAME@$DOMAIN>, data.
  • + +
  • Edit /etc/passwd, /etc/shadow, and /etc/group to manually create a user account. Use the cryptout program below to generate the second field of /etc/shadow containing the encrypted password.
  • + +
  • Understand the hosts line in + /etc/nsswitch.conf.
  • + +
  • Does host $HOSTNAME always print the same IPv4 + address as ping $HOSTNAME?
  • + +
  • Run nc localhost 22 to determine the SSH server + version.
  • + +
  • Forward the TCP port 12345 of your local machine to an + internal server using ssh's -L option for local port + forwarding. Check that you can log in with ssh -p 12345 + localhost.
  • + +
  • Search the ssh_config(5) man page for + NoHostAuthenticationForLocalhost and ponder if it is a good + idea to set this to yes.
  • + +
  • Add Host and Hostname entries + to your ssh config file so that you can log in with ssh + hostname, even though hostname does not resolve + on your local network.
  • + +
  • Create an ssh key pair and add the public part so that you can + log in without specifying a password. Discuss the security implications + of this setup.
  • + +
  • In an ssh session, type ~C to open the ssh command + prompt. Use a suitable -L command to add a local port + forward to the existing connection. Type ~? to see + the available escape sequences.
  • + +
  • Add the lines Host *.eb.local and ProxyJump + cgw.tuebingen.mpg.de to your ssh config file. Then type + ssh olt.eb.local. Check ssh(1) to learn how + this works.
  • + +
+ +HOMEWORK(« +Explain the difference between local and remote port forwarding. Give +a typical example for either type of forwarding. +») + +SECTION(«The Netlink Messaging System») + +

The various layers and protocols discussed earlier in this chapter dealt with the communication between hosts which are connected by a network. The Linux-specific Netlink Interface, however, does not fit into this picture because it is a messaging system for passing network-related information between the kernel and a user space program, and vice-versa. Among other uses, tools like ip(8) and ifconfig(8) employ Netlink to configure network devices. Netlink is implemented on top of the socket infrastructure, so the communication link between a user space program and the kernel is established by means of the usual system calls socket(2), bind(2), connect(2), and messages are transferred by calling sendmsg(2) and recvmsg(2).

+ +

There are several netlink families which select the +kernel subsystem to communicate with. We shall only be concerned +with the NETLINK_ROUTE family, which is used to +modify network routes, IP addresses, and more. The details of +NETLINK_ROUTE are described in rtnetlink(7) +while netlink(7) covers the general interface and the +currently assigned families.

+ +

A Netlink message starts with a 16-byte header as defined by struct nlmsghdr. To report errors to userspace, Netlink provides a message type that encapsulates an error header defined by struct nlmsgerr. Both structures are declared in include/linux/netlink.h. Full Netlink messages, including the Netlink header, are transferred. Therefore the user space program has to implement a parser for both regular Netlink messages and Netlink error messages, as well as a primitive for setting up properly formatted Netlink messages to be sent to the kernel. Several user space libraries aim to help the programmer with this repetitive and error-prone task, the minimalistic Netlink library (libmnl) being the most popular one.
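To make the message format more concrete, here is a minimal sketch which uses no helper library: it sends an RTM_GETLINK dump request over a NETLINK_ROUTE socket and walks the replies with the NLMSG_* macros until NLMSG_DONE or NLMSG_ERROR is seen. Error handling is kept to a bare minimum; libraries such as libmnl essentially wrap this kind of boilerplate.

	/* Minimal sketch: ask the kernel to dump all network interfaces
	 * (RTM_GETLINK) and iterate over the returned Netlink messages. */
	#include <sys/socket.h>
	#include <linux/netlink.h>
	#include <linux/rtnetlink.h>
	#include <string.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		struct {
			struct nlmsghdr nh;
			struct rtgenmsg g;
		} req;
		struct sockaddr_nl sa = { .nl_family = AF_NETLINK }; /* nl_pid 0: the kernel */
		char buf[16384];
		int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

		if (fd < 0)
			exit(EXIT_FAILURE);
		memset(&req, 0, sizeof(req));
		req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(req.g));
		req.nh.nlmsg_type = RTM_GETLINK;
		req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
		req.g.rtgen_family = AF_UNSPEC;
		if (sendto(fd, &req, req.nh.nlmsg_len, 0,
				(struct sockaddr *)&sa, sizeof(sa)) < 0)
			exit(EXIT_FAILURE);
		for (;;) {
			ssize_t n = recv(fd, buf, sizeof(buf), 0);
			struct nlmsghdr *nh;

			if (n <= 0)
				exit(EXIT_FAILURE);
			for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, n);
					nh = NLMSG_NEXT(nh, n)) {
				if (nh->nlmsg_type == NLMSG_DONE)
					exit(EXIT_SUCCESS);
				if (nh->nlmsg_type == NLMSG_ERROR)
					exit(EXIT_FAILURE);
				if (nh->nlmsg_type == RTM_NEWLINK)
					/* each reply describes one network interface */
					printf("interface with index %d\n",
						((struct ifinfomsg *)NLMSG_DATA(nh))->ifi_index);
			}
		}
	}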

+ +SUPPLEMENTS() + +SUBSECTION(«cryptout.c») + +
+	
+		#include <stdlib.h>
+		#include <crypt.h>
+		#include <stdio.h>
+		#include <sys/random.h>
+
+		static const char set[] =
+			"abcdefghijklmnopqrstuvwxyz"
+			"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+			"0123456789./";
+
+		int main(int argc, char **argv)
+		{
+			unsigned char rnd[2];
+			char salt[3], *result;
+
+			if (argc < 2)
+				exit(EXIT_FAILURE);
+			if (getrandom(rnd, 2, 0) < 0)
+				exit(EXIT_FAILURE);
+			salt[0] = set[rnd[0] & 63];
+			salt[1] = set[rnd[1] & 63];
+			salt[2] = '\0'; /* crypt(3) expects a NUL-terminated salt string */
+			result = crypt(argv[1], salt);
+			if (!result)
+				exit(EXIT_FAILURE);
+			printf("%s\n", result);
+			exit(EXIT_SUCCESS);
+		}
+	
+
diff --git a/OS-Level_Virtualization.m4 b/OS-Level_Virtualization.m4 new file mode 100644 index 0000000..f5137ea --- /dev/null +++ b/OS-Level_Virtualization.m4 @@ -0,0 +1,823 @@ +TITLE(« +Fools ignore complexity. Pragmatists suffer it. Some can avoid it. +Geniuses remove it. -- Perlis's Programming Proverb #58 (1982) +», __file__) + + +OVERVIEW(« + +In general, virtualization refers to the abstraction of computer +resources. This chapter is primarily concerned with server +virtualization, a concept which makes it possible to run +more than one operating system simultaneously and independently +of each other on a single physical computer. We first describe +the different virtualization frameworks but quickly specialize on +Linux OS-level virtualization and their virtual machines called +containers. Container platforms for Linux are built on top of +namespaces and control groups, the low-level kernel +features which implement abstraction and isolation of processes. We +look at both concepts in some detail. One of the earliest container +platforms for Linux is LXC (Linux containers) which is +discussed in a dedicated section. + +») + +SECTION(«Virtualization Frameworks») + +The origins of server virtualization date back to the 1960s. The +first virtual machine was created as a collaboration between IBM +(International Business Machines) and the MIT (Massachusetts Institute +of Technology). Since then, many different approaches have been +designed, resulting in several Virtualization Frameworks. All +frameworks promise to improve resource utilization and availability, to +reduce costs, and to provide greater flexibility. While some of these +benefits might be real, they do not come for free. Their costs include: +the host becomes a single point of failure, decreased performance, +added complexity and increased maintenance costs due to extensive +debugging, documentation, and maintenance of the VMs. This chapter +briefly describes the three main virtualization frameworks. We list +the advantages and disadvantages of each and give some examples. + +SUBSECTION(«Software Virtualization (Emulation)») + +This virtualization framework does not play a significant role in +server virtualization, it is only included for completeness. Emulation +means to imitate a complete hardware architecture in software, +including peripheral devices. All CPU instructions and hardware +interrupts are interpreted by the emulator rather than being run by +native hardware. Since this approach has a large performance penalty, +it is only suitable when speed is not critical. For this reason, +emulation is typically employed for ancient hardware like arcade +game systems and home computers such as the Commodore 64. Despite +the performance penalty, emulation is valuable because it allows +applications and operating systems to run on the current platform as +they did in their original environment. + +Examples: Bochs, Mame, VICE. + +SUBSECTION(«Paravirtualization and Hardware-Assisted Virtualization») + +These virtualization frameworks are characterized by the presence +of a hypervisor, also known as Virtual Machine +Monitor, which translates system calls from the VMs to native +hardware requests. In contrast to Software Virtualization, the +host OS does not emulate hardware resources but offers a special +APIs to the VMs. If the presented interface is different to that +of the underlying hardware, the term paravirtualization +is used. The guest OS then has to be modified to include modified +(paravirtualized) drivers. 
In 2005 AMD and Intel added hardware +virtualization instructions to the CPUs and IOMMUs (Input/Output memory +management units) to the chipsets. This allowed VMs to directly execute +privileged instructions and use peripheral devices. This so-called +Hardware-Assisted Virtualization allows unmodified operating +systems to run on the VMs. + +The main advantage of Hardware-Assisted Virtualization is its +flexibility, as the host OS does not need to match the OS running on +the VMs. The disadvantages are hardware compatibility constraints and +performance loss. Although these days all hardware has virtualization +support, there are still significant differences in performance between +the host and the VM. Moreover, peripheral devices like storage hardware +has to be compatible with the chipset to make use of the IOMMU. + +Examples: KVM (with QEMU as hypervisor), Xen, UML + +SUBSECTION(«OS-level Virtualization (Containers)») + +OS-level Virtualization is a technique for lightweight virtualization. +The abstractions are built directly into the kernel and no +hypervisor is needed. In this context the term "virtual machine" is +inaccurate, which is why the OS-level VMs are called differently in +this context. On Linux, they are called containers, other +operating systems call them jails or zones. We +shall exclusively use "container" from now on. All containers share +a single kernel, so the OS running in the container has to match the +host OS. However, each container has its own root file system, so +containers can differ in user space. For example, different containers +can run different Linux distributions. Since programs running in a +container use the normal system call interface to communicate with +the kernel, OS-level Virtualization does not require hardware support +for efficient performance. In fact, OS-level Virtualization imposes +no overhead at all. + +OS-level Virtualization is superior to the alternatives because of its +simplicity and its performance. The only disadvantage is the lack of +flexibility. It is simply not an option if some of the VMs must run +different operating systems than the host. + +Examples: LXC, Singularity, Docker. + +EXERCISES() + +
    + +
  • On any Linux system, check if the processor supports virtualization + by running cat /proc/cpuinfo. Hint: svm and vmx.
  • + +
  • Hypervisors come in two flavors called native and + hosted. Explain the difference and the pros and cons of either + flavor. Is QEMU a native or a hosted hypervisor?
  • + +
  • Scan through chapter 15 (Secure Virtual Machine) of the + + AMD Programmer's Manual + + to get an idea of the complexity of Hardware-Assisted + Virtualization.
  • + +
+ +HOMEWORK(« + +
    +
  • Recall the concept of direct memory access (DMA) + and explain why DMA is a problem for virtualization. Which of the + three virtualization frameworks of this chapter are affected by this + problem?
  • + +
  • Compare AMD's Rapid Virtualization Indexing to Intel's + Extended Page Tables.
  • + +
  • Suppose a hacker gained root access to a VM and wishes to proceed from there to also gain full control over the host OS. Discuss the threat model in the context of the three virtualization frameworks covered in this section.
  • + +
+») + +SECTION(«Namespaces») + +Namespaces partition the set of processes into disjoint subsets +with local scope. Where the traditional Unix systems provided only +a single system-wide resource shared by all processes, the namespace +abstractions make it possible to give processes the illusion of living +in their own isolated instance. Linux implements the following +six different types of namespaces: mount (Linux-2.4.x, 2002), IPC +(Linux-2.6.19, 2006), UTS (Linux-2.6.19, 2006), PID (Linux-2.6.24, +2008), network (Linux-2.6.29, 2009), UID (Linux-3.8, 2013). +For OS-level virtualization all six name space types are typically +employed to make the containers look like independent systems. + +Before we look at each namespace type, we briefly describe how +namespaces are created and how information related to namespaces can +be obtained for a process. + +SUBSECTION(«Namespace API») + +

Initially, there is only a single namespace of each type, called the root namespace. All processes belong to this namespace. The clone(2) system call is a generalization of the classic fork(2) which allows privileged users to create new namespaces by passing one or more of the six CLONE_NEW* flags. The child process is made a member of the new namespace(s). Calling plain fork(2), or clone(2) with no CLONE_NEW* flag, lets the newly created process inherit the namespaces from its parent. There are two additional system calls, setns(2) and unshare(2), which both change the namespace(s) of the calling process without creating a new process. For the latter there is also a user command, unshare(1), which makes the namespace API available to scripts.
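The supplements below contain a clone(2)-based example; as a complement, the following minimal sketch uses unshare(2) to move the calling process into a new UTS namespace and then changes the hostname. It has to be run as root (CAP_SYS_ADMIN), and the hostname "sandbox" is of course arbitrary. The hostname seen by processes outside the new namespace is unaffected.

	/* Minimal sketch: unshare the UTS namespace, then set a hostname
	 * which is only visible inside the new namespace. */
	#define _GNU_SOURCE
	#include <sched.h>
	#include <sys/utsname.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		struct utsname uts;

		if (unshare(CLONE_NEWUTS) < 0) {
			perror("unshare");
			exit(EXIT_FAILURE);
		}
		if (sethostname("sandbox", 7) < 0) {
			perror("sethostname");
			exit(EXIT_FAILURE);
		}
		uname(&uts);
		printf("hostname in new UTS namespace: %s\n", uts.nodename);
		exit(EXIT_SUCCESS);
	}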

+ +

The /proc/$PID directory of each process contains a + ns subdirectory which contains one file per namespace +type. The inode number of this file is the namespace ID. +Hence, by running stat(1) one can tell whether +two different processes belong to the same namespace. Normally a +namespace ceases to exist when the last process in the namespace +terminates. However, by opening /proc/$PID/ns/$TYPE +one can prevent the namespace from disappearing.
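The following sketch performs the same check programmatically: it runs stat(2) on the ns/net files of two processes whose PIDs are given on the command line (the choice of the network namespace is arbitrary) and compares the inode numbers, which serve as the namespace IDs.

	/* Minimal sketch: compare the network namespaces of two processes.
	 * Usage (illustrative): ./a.out PID1 PID2 */
	#include <sys/stat.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		char path[64];
		struct stat st[2];
		int i;

		if (argc < 3)
			exit(EXIT_FAILURE);
		for (i = 0; i < 2; i++) {
			snprintf(path, sizeof(path), "/proc/%s/ns/net", argv[i + 1]);
			if (stat(path, st + i) < 0) {
				perror(path);
				exit(EXIT_FAILURE);
			}
		}
		printf("namespace IDs: %llu and %llu -> %s\n",
			(unsigned long long)st[0].st_ino,
			(unsigned long long)st[1].st_ino,
			st[0].st_ino == st[1].st_ino? "same" : "different");
		exit(EXIT_SUCCESS);
	}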

+ +SUBSECTION(«UTS Namespaces») + +UTS is short for UNIX Time-sharing System. The old fashioned +word "Time-sharing" has been replaced by multitasking +but the old name lives on in the uname(2) system +call which fills out the fields of a struct utsname. +On return the nodename field of this structure +contains the hostname which was set by a previous call to +sethostname(2). Similarly, the domainname field +contains the string that was set with setdomainname(2). + +UTS namespaces provide isolation of these two system identifiers. That +is, processes in different UTS namespaces might see different host- and +domain names. Changing the host- or domainname affects only processes +which belong to the same UTS namespace as the process which called + sethostname(2) or setdomainname(2). + +SUBSECTION(«Mount Namespaces») + +The mount namespaces are the oldest Linux namespace +type. This is kind of natural since they are supposed to overcome +well-known limitations of the venerable chroot(2) +system call which was introduced in 1979. Mount namespaces isolate +the mount points seen by processes so that processes in different +mount namespaces can have different views of the file system hierarchy. + +Like for other namespace types, new mount namespaces are created by +calling clone(2) or unshare(2). The +new mount namespace starts out with a copy of the caller's mount +point list. However, with more than one mount namespace the +mount(2) and umount(2) system calls no longer +operate on a global set of mount points. Whether or not a mount +or unmount operation has an effect on processes in different mount +namespaces than the caller's is determined by the configurable +mount propagation rules. By default, modifications to the list +of mount points have only affect the processes which are in the same +mount namespace as the process which initiated the modification. This +setting is controlled by the propagation type of the +mount point. Besides the obvious private and shared types, there is +also the MS_SLAVE propagation type which lets mount +and unmount events propagate from from a "master" to its "slaves" +but not the other way round. + +SUBSECTION(«Network Namespaces») + +Network namespaces not only partition the set of processes, as all +six namespace types do, but also the set of network interfaces. That +is, each physical or virtual network interface belongs to one (and +only one) network namespace. Initially, all interfaces are in the +root network namespace. This can be changed with the command +ip link set iface netns PID. Processes only see interfaces +whose network namespace matches the one they belong to. This lets +processes in different network namespaces have different ideas about +which network devices exist. Each network namespace has its own IP +stack, IP routing table and TCP and UDP ports. This makes it possible +to start, for example, many sshd(8) processes which +all listen on "their own" TCP port 22. + +An OS-level virtualization framework typically leaves physical +interfaces in the root network namespace but creates a dedicated +network namespace and a virtual interface pair for each container. One +end of the pair is left in the root namespace while the other end is +configured to belong to the dedicated namespace, which contains all +processes of the container. + +SUBSECTION(«PID Namespaces») + +This namespace type allows a process to have more than one process +ID. 
Unlike network interfaces which disappear when they enter a +different network namespace, a process is still visible in the root +namespace after it has entered a different PID namespace. Besides its +existing PID it gets a second PID which is only valid inside the target +namespace. Similarly, when a new PID namespace is created by passing +the CLONE_NEWPID flag to clone(2), the +child process gets some unused PID in the original PID namepspace +but PID 1 in the new namespace. + +As as consequence, processes in different PID namespaces can have the +same PID. In particular, there can be arbitrary many "init" processes, +which all have PID 1. The usual rules for PID 1 apply within each PID +namespace. That is, orphaned processes are reparented to the init +process, and it is a fatal error if the init process terminates, +causing all processes in the namespace to terminate as well. PID +namespaces can be nested, but under normal circumstances they are +not. So we won't discuss nesting. + +Since each process in a non-root PID namespace has also a PID in the +root PID namespace, processes in the root PID namespace can "see" all +processes but not vice versa. Hence a process in the root namespace can +send signals to all processes while processes in the child namespace +can only send signals to processes in their own namespace. + +Processes can be moved from the root PID namespace into a child +PID namespace but not the other way round. Moreover, a process can +instruct the kernel to create subsequent child processes in a different +PID namespace. + +SUBSECTION(«User Namespaces») + +User namespaces have been implemented rather late compared to other +namespace types. The implementation was completed in 2013. The purpose +of user namespaces is to isolate user and group IDs. Initially there +is only one user namespace, the initial namespace to which +all processes belong. As with all namespace types, a new user namespace +is created with unshare(2) or clone(2). + +The UID and GID of a process can be different in different +namespaces. In particular, an unprivileged process may have UID +0 inside an user namespace. When a process is created in a new +namespace or an process joins an existing user namespace, it gains full +privileges in this namespace. However, the process has no additional +privileges in the parent/previous namespace. Moreover, a certain flag +is set for the process which prevents the process from entering yet +another namespace with elevated privileges. In particular it does not +keep its privileges when it returns to its original namespace. User +namespaces can be nested, but we don't discuss nesting here. + +Each user namespace has an owner, which is the effective user +ID (EUID) of the process which created the namespace. Any process +in the root user namespace whose EUID matches the owner ID has all +capabilities in the child namespace. + +If CLONE_NEWUSER is specified together with other + CLONE_NEW* flags in a single clone(2) +or unshare(2) call, the user namespace is guaranteed +to be created first, giving the child/caller privileges over the +remaining namespaces created by the call. + +It is possible to map UIDs and GIDs between namespaces. The +/proc/$PID/uid_map and /proc/$PID/gid_map files +are used to get and set the mappings. We will only talk about UID +mappings in the sequel because the mechanism for the GID mappings are +analogous. 
When the /proc/$PID/uid_map (pseudo-)file is +read, the contents are computed on the fly and depend on both the user +namespace to which process $PID belongs and the user +namespace of the calling process. Each line contains three numbers +which specify the mapping for a range of UIDs. The numbers have +to be interpreted in one of two ways, depending on whether the two +processes belong to the same user namespace or not. All system calls +which deal with UIDs transparently translate UIDs by consulting these +maps. A map for a newly created namespace is established by writing +UID-triples once to one uid_map +file. Subsequent writes will fail. + +SUBSECTION(«IPC Namespaces») + +System V inter process communication (IPC) subsumes three different +mechanisms which enable unrelated processes to communicate with each +other. These mechanisms, known as message queues, +semaphores and shared memory, predate Linux by at +least a decade. They are mandated by the POSIX standard, so every Unix +system has to implement the prescribed API. The common characteristic +of the System V IPC mechanisms is that their objects are addressed +by system-wide IPC identifiers rather than by pathnames. + +IPC namespaces isolate these resources so that processes in different +IPC namespaces have different views of the existing IPC identifiers. +When a new IPC namespace is created, it starts out with all three +identifier sets empty. Newly created IPC objects are only visible +for processes which belong to the same IPC namespace as the process +which created the object. + +EXERCISES() + +
    + +
  • Examine /proc/$$/mounts, + /proc/$$/mountinfo, and /proc/$$/mountstats. +
  • + +
  • Recall the concept of a bind mount. Describe the + sequence of mount operations a container implementation would need + to perform in order to set up a container whose root file system + is mounted on, say, /mnt before the container is + started.
  • + +
  • What should happen on the attempt to change a read-only mount + to be read-write from inside of a container?
  • +
  • Compile and run uts-ns.c, a minimal C program which illustrates how to create a new UTS namespace. Explain each line of the source code.
  • + +
  • Run ls -l /proc/$$/ns to see the namespaces of the shell. Run stat -L /proc/$$/ns/uts and confirm that the inode number coincides with the number shown in the link target in the ls output.
  • Discuss why creating a namespace is a privileged operation.
  • + +
  • What is the parent process ID of the init process? Examine the + fourth field of /proc/1/stat to confirm.
  • + +
  • It is possible for a process in a PID namespace to have a parent + which is outside of this namespace. This is certainly the case for + the process with PID 1. Can this also happen for a different process? +
  • + +
  • Examine the pid-ns.c program. Will the + two numbers printed as PID and child PID + be the same? What will be the PPID number? Compile and run the program + to see if your guess was correct. + +
  • Create a veth device pair. Check that both ends of the pair are visible with ip link show. Start a second shell in a different network namespace and confirm by running the same command that no network interfaces exist in this namespace. In the original namespace, set the namespace of one end of the pair to the process ID of the second shell and confirm that the interface "moved" from one namespace to the other. Configure (different) IP addresses on both ends of the pair and transfer data through the ethernet tunnel between the two shell processes which reside in different network namespaces.
  • + +
  • Loopback, bridge, ppp and wireless are network namespace + local devices, meaning that the namespace of such devices can + not be changed. Explain why. Run ethtool -k iface + to find out which devices are network namespace local.
  • + +
  • In a user namespace where the uid_map file has + not been written, system calls like setuid(2) which + change process UIDs fail. Why?
  • + +
  • What should happen if a set-user-ID program is executed inside + of a user namespace and the on-disk UID of the program is not a mapped + UID?
  • + +
  • Is it possible for a UID to map to different user names even if + no user namespaces are in use?
  • + +
+ +HOMEWORK(« +The shmctl(2) system call performs operations on a System V +shared memory segment. It operates on a shmid_ds structure +which contains in the shm_lpid field the PID of the process +which last attached or detached the segment. Describe the implications this API +detail has on the interaction between IPC and PID namespaces. +») + +SECTION(«Control Groups») + + Control groups (cgroups) allow processes to be grouped +and organized hierarchically in a tree. Each control group contains +processes which can be monitored or controlled as a unit, for example +by limiting the resources they can occupy. Several controllers + exist (CPU, memory, I/O, etc.), some of which actually impose +control while others only provide identification and relay control +to separate mechanisms. Unfortunately, control groups are not easy to +understand because the controllers are implemented in an inconsistent +way and because of the rather chaotic relationship between them. + +In 2014 it was decided to rework the cgroup subsystem of the Linux +kernel. To keep existing applications working, the original cgroup +implementation, now called cgroup-v1, was retained and a +second, incompatible, cgroup implementation was designed. Cgroup-v2 +aims to address the shortcomings of the first version, including its +inefficiency, inconsistency and the lack of interoperability among +controllers. The cgroup-v2 API was made official in 2016. Version 1 +continues to work even if both implementations are active. + +Both cgroup implementations provide a pseudo file system that +must be mounted in order to define and configure cgroups. The two +pseudo file systems may be mounted at the same time (on different +mountpoints). For both cgroup versions, the standard mkdir(2) + system call creates a new cgroup. To add a process to a cgroup +one must write its PID to one of the files in the pseudo file system. + +We will cover both cgroup versions because as of 2018-11 many +applications still rely on cgroup-v1 and cgroup-v2 still lacks some +of the functionality of cgroup-v1. However, we will not look at +all controllers. + +SUBSECTION(«CPU controllers») + +These controllers regulate the distribution of CPU cycles. The +cpuset controller of cgroup-v1 is the oldest cgroup controller, +it was implemented before the cgroups-v1 subsystem existed, which is +why it provides its own pseudo file system which is usually mounted at +/dev/cpuset. This file system is only kept for backwards +compability and is otherwise equivalent to the corresponding part of +the cgroup pseudo file system. The cpuset controller links subsets +of CPUs to cgroups so that the processes in a cgroup are confined to +run only on the CPUs of "their" subset. + +The CPU controller of cgroup-v2, which is simply called "cpu", works +differently. Instead of specifying the set of admissible CPUs for a +cgroup, one defines the ratio of CPU cycles for the cgroup. Work to +support CPU partitioning as the cpuset controller of cgroup-v1 is in +progress and expected to be ready in 2019. + +SUBSECTION(«Devices») + +The device controller of cgroup-v1 imposes mandatory access control +for device-special files. It tracks the open(2) and + mknod(2) system calls and enforces the restrictions +defined in the device access whitelist of the cgroup the +calling process belongs to. + +Processes in the root cgroup have full permissions. Other cgroups +inherit the device permissions from their parent. A child cgroup +never has more permission than its parent. 
+ +Cgroup-v2 takes a completely different approach to device access +control. It is implemented on top of BPF, the Berkeley packet +filter. Hence this controller is not listed in the cgroup-v2 +pseudo file system. + +SUBSECTION(«Freezer») + +Both cgroup-v1 and cgroup-v2 implement a freezer controller, +which provides an ability to stop ("freeze") all processes in a +cgroup to free up resources for other tasks. The stopped processes can +be continued ("thawed") as a unit later. This is similar to sending +SIGSTOP/SIGCONT to all processes, but avoids some problems +with corner cases. The v2 version was added in 2019-07. It is available +from Linux-5.2 onwards. + +SUBSECTION(«Memory») + +Cgroup-v1 offers three controllers related to memory management. First +there is the cpusetcontroller described above which can be instructed +to let processes allocate only memory which is close to the CPUs +of the cpuset. This makes sense on NUMA (non-uniform memory access) +systems where the memory access time for a given CPU depends on the +memory location. Second, the hugetlb controller manages +distribution and usage of huge pages. Third, there is the + memory resource controller which provides a number of +files in the cgroup pseudo file system to limit process memory usage, +swap usage and the usage of memory by the kernel on behalf of the +process. The most important tunable of the memory resource controller +is limit_in_bytes. + +The cgroup-v2 version of the memory controller is rather more complex +because it attempts to limit direct and indirect memory usage of +the processes in a cgroup in a bullet-proof way. It is designed to +restrain even malicious processes which try to slow down or crash +the system by indirectly allocating memory. For example, a process +could try to create many threads or file descriptors which all cause a +(small) memory allocation in the kernel. Besides several tunables and +statistics, the memory controller provides the memory.events + file whose contents change whenever a state transition +for the cgroup occurs, for example when processes are started to get +throttled because the high memory boundary was exceeded. This file +could be monitored by a management agent to take appropriate +actions. The main mechanism to control the memory usage is the +memory.high file. + +SUBSECTION(«I/O») + +I/O controllers regulate the distribution of IO resources among +cgroups. The throttling policy of cgroup-v2 can be used to enforce I/O +rate limits on arbitrary block devices, for example on a logical volume +provided by the logical volume manager (LVM). Read and write bandwidth +may be throttled independently. Moreover, the number of IOPS (I/O +operations per second) may also be throttled. The I/O controller of +cgroup-v1 is called blkio while for cgroup-v2 it is simply +called io. The features of the v1 and v2 I/O controllers +are identical but the filenames of the pseudo files and the syntax +for setting I/O limits differ. The exercises ask the reader to try +out both versions. + +There is no cgroup-v2 controller for multi-queue schedulers so far. +However, there is the I/O Latency controller for cgroup-v2 +which works for arbitrary block devices and all I/O schedulers. It +features I/O workload protection for the processes in +a cgroup. This works by throttling the processes in cgroups that +have a lower latency target than those in the protected cgroup. The +throttling is performed by lowering the depth of the request queue +of the affected devices. + +EXERCISES() + +
    +
  • Run mount -t cgroup none /var/cgroup and + mount -t cgroup2 none /var/cgroup2 to mount both cgroup pseudo + file systems and explore the files they provide.
  • + +
  • Learn how to put the current shell into a new cgroup. + Hints: For v1, start with echo 0 > cpuset.mems && echo 0 > + cpuset.cpus. For v2: First activate controllers for the cgroup + in the parent directory.
  • + +
  • Set up the cpuset controller so that your shell process has only + access to a single CPU core. Test that the limitation is enforced by + running stress -c 2.
  • + +
  • Repeat the above for the cgroup-v2 CPU controller. Hint: + echo 1000000 1000000 > cpu.max.
  • + +
  • In a cgroup with one bash process, start a simple loop that prints + some output: while :; do date; sleep 1; done. Freeze + and unfreeze the cgroup by writing the string FROZEN + to a suitable freezer.state file in the cgroup-v1 file + system. Then unfreeze the cgroup by writing THAWED + to the same file. Find out how one can tell whether a given cgroup + is frozen.
  • + +
  • Pick a block device to throttle. Estimate its maximal read + bandwidth by running a command like ddrescue /dev/sdX + /dev/null. Enforce a read bandwidth rate of 1M/s for the + device by writing a string of the form "$MAJOR:$MINOR $((1024 * + 1024))" to a file named blkio.throttle.read_bps_device + in the cgroup-v1 pseudo file system. Check that the bandwidth + was indeed throttled by running the above ddrescue + command again.
  • + +
  • Repeat the previous exercise, but this time use the cgroup-v2 interface for the I/O controller. Hint: write a string of the form "$MAJOR:$MINOR rbps=$((1024 * 1024))" to a file named io.max.
  • + +
+ +HOMEWORK(« +
    + +
  • In one terminal running bash, start a second + bash process and print its PID with echo $$. + Guess what happens if you run kill -STOP $PID; kill -CONT + $PID from a second terminal, where $PID + is the PID that was printed in the first terminal. Try it out, + explain the observed behaviour and discuss its impact on the freezer + controller. Repeat the experiment but this time use the freezer + controller to stop and restart the bash process.
  • +
+ +») + +SECTION(«Linux Containers (LXC)») + +Containers provide resource management through control groups and +resource isolation through namespaces. A container platform +is thus a software layer implemented on top of these features. Given a +directory containing a Linux root file system, starting the container +is a simple matter: First clone(2) is called with the +proper NEW_* flags to create a new process in a suitable +set of namespaces. The child process then creates a cgroup for the +container and puts itself into it. The final step is to let the child +process hand over control to the container's /sbin/init +by calling exec(2). When the last process in the newly +created namespaces exits, the namespaces disappear and the parent +process removes the cgroup. The details are a bit more complicated, +but the above covers the essence of what the container startup command +has to do. + +Many container platforms offer additional features not to be discussed +here, like downloading and unpacking a file system image from the +internet, or supplying the root file system for the container by other +means, for example by creating an LVM snapshot of a master image. +LXC is a comparably simple container platform which can be used to +start a single daemon in a container, or to boot a container from +a root file system as described above. It provides several +lxc-* commands to start, stop and maintain containers. +LXC version 1 is much simpler than subsequent versions, and is still +being maintained, so we only discuss this version of LXC here. + +An LXC container is defined by a configuration file in +the format described in lxc.conf(5). A minimal configuration which +defines a network device and requests CPU and memory isolation has +as few as 10 lines (not counting comments). With the configuration +file and the root file system in place, the container can be started +by running lxc-start -n $NAME. One can log in to the +container on the local pseudo terminal or via ssh (provided the sshd +package is installed). The container can be stopped by executing + halt from within the container, or by running +lxc-stop on the host system. lxc-ls and + lxc-info print information about containers, and +lxc-cgroup changes the settings of the cgroup associated with +a container. + +The exercises ask the reader to install the LXC package from source, +and to set up a minimal container running Ubuntu-18.04. + +EXERCISES() + +
    + +
  • Clone the LXC git repository from + https://github.com/lxc/lxc, check out the stable-1.0 + tag. Compile the source code with ./autogen.sh + and ./configure && make. Install with sudo make + install.
  • + +
  • Download a minimal Ubuntu root file system with a command like + debootstrap --download-only --include isc-dhcp-client bionic + /media/lxc/buru/ http://de.archive.ubuntu.com/ubuntu.
  • + +
  • Set up an ethernet bridge as described in the Link Layer section of the + chapter on networking.
  • + +
  • Examine the minimal + configuration file for the container and copy it to + /var/lib/lxc/buru/config. Adjust host name, MAC address and + the name of the bridge interface.
  • + +
  • Start the container with lxc-start -n buru.
  • + +
  • While the container is running, investigate the control files of the + cgroup pseudo file system. Identify the pseudo files which describe the + CPU and memory limit.
  • + +
  • Come up with a suitable lxc-cgroup command + to change the cpuset and the memory of the container while it is + running.
  • + +
  • On the host system, create a loop device and a file system on + it. Mount the file system on a subdirectory of the root file system + of the container. Note that the mount is not visible from within the + container. Come up with a way to make it visible without restarting + the container.
  • + +
+ +HOMEWORK(«Compare the features of LXC versions 1, 2 and 3.») + +SUPPLEMENTS() + +SUBSECTION(«UTS Namespace Example») +
+	
+		#define _GNU_SOURCE
+		#include <sys/utsname.h>
+		#include <sched.h>
+		#include <stdio.h>
+		#include <stdlib.h>
+		#include <unistd.h>
+
+		static void print_hostname_and_exit(const char *pfx)
+		{
+			struct utsname uts;
+
+			uname(&uts);
+			printf("%s: %s\n", pfx, uts.nodename);
+			exit(EXIT_SUCCESS);
+		}
+
+		static int child(void *arg)
+		{
+			sethostname("jesus", 5);
+			print_hostname_and_exit("child");
+		}
+
+		#define STACK_SIZE (64 * 1024)
+		static char child_stack[STACK_SIZE];
+
+		int main(int argc, char *argv[])
+		{
+			clone(child, child_stack + STACK_SIZE, CLONE_NEWUTS, NULL);
+			print_hostname_and_exit("parent");
+		}
+	
+
+ +SUBSECTION(«PID Namespace Example») +
+	
+		#define _GNU_SOURCE
+		#include <sched.h>
+		#include <unistd.h>
+		#include <stdlib.h>
+		#include <stdio.h>
+
+		static int child(void *arg)
+		{
+			printf("PID: %d, PPID: %d\n", (int)getpid(), (int)getppid());
+		}
+
+		#define STACK_SIZE (64 * 1024)
+		static char child_stack[STACK_SIZE];
+
+		int main(int argc, char *argv[])
+		{
+			pid_t pid = clone(child, child_stack + STACK_SIZE, CLONE_NEWPID, NULL);
+			printf("child PID: %d\n", (int)pid);
+			exit(EXIT_SUCCESS);
+		}
+	
+
+ +SUBSECTION(«Minimal LXC Config File») +
+	
+		# Employ cgroups to limit the CPUs and the amount of memory the container is
+		# allowed to use.
+		lxc.cgroup.cpuset.cpus = 0-1
+		lxc.cgroup.memory.limit_in_bytes = 2G
+
+		# So that the container starts out with a fresh UTS namespace that
+		# has already set its hostname.
+		lxc.utsname = buru
+
+		# LXC does not play ball if we don't set the type of the network device.
+		# It will always be veth.
+		lxc.network.type = veth
+
+		# This sets the name of the veth pair which is visible on the host. This
+		# way it is easy to tell which interface belongs to which container.
+		lxc.network.veth.pair = buru
+
+		# Of course we need to tell LXC where the root file system of the container
+		# is located. LXC will automatically mount a couple of pseudo file systems
+		# for the container, including /proc and /sys.
+		lxc.rootfs = /media/lxc/buru
+
+		# so that we can assign a fixed address via DHCP
+		lxc.network.hwaddr = ac:de:48:32:35:cf
+
+		# You must NOT have a link from /dev/kmsg pointing to /dev/console. In the host
+		# it should be a real device. In a container it must NOT exist. When /dev/kmsg
+		# points to /dev/console, systemd-journald reads from /dev/kmsg and then writes
+		# to /dev/console (which it then reads from /dev/kmsg and writes again to
+		# /dev/console ad infinitum). You've inadvertently created a messaging loop
+		# that's causing systemd-journald to go berserk on your CPU.
+		#
+		# Make sure to remove /var/lib/lxc/${container}/rootfs.dev/kmsg
+		lxc.kmsg = 0
+
+		lxc.network.link = br39
+
+		# This is needed for lxc-console
+		lxc.tty = 4
+	
+
+ +SECTION(«Further Reading») + diff --git a/Unix_Concepts.m4 b/Unix_Concepts.m4 new file mode 100644 index 0000000..ae0c4ec --- /dev/null +++ b/Unix_Concepts.m4 @@ -0,0 +1,2527 @@ +TITLE(« + + Unix is user-friendly. It's just very selective about who + its friends are. -- Unknown + +», __file__) + +SECTION(«History and Philosophy») + +SUBSECTION(«Early Unix History») + +

Unix was created in 1969 as a successor of Multics, the +MULTiplexed Information and Computing Service, which had been +in use since the mid 1960s as the successor of CTSS, the Compatible +Time-Sharing System of the early 1960s. Multics aimed to get +CTSS right, but failed in this regard and was eventually discontinued +because of its complexity. The Unix approach was very different as it +was brainstormed by only three people and then implemented by Ken +Thompson at Bell Laboratories in two days. Unlike its predecessors it +focused on elegance and simplicity. The name was originally spelt +UNICS (UNiplexed Information and Computing Service) to +emphasize the contrast to Multics.

+ +

The original Unix implementation was written in assembly +language for the 18 bit processor of the PDP-7 "minicomputer", a +device of the size of a wardrobe which was considered small by the +standards of the day. Like all computers of this era, the PDP-7 was +not connected to a video screen. Instead, input had to be typed in on +the console, a device which looked much like an electric +typewriter. Output from the computer was printed on rolls of paper. +Since the assembly instructions could not easily be ported to different +hardware, Dennis Ritchie invented the C programming language in +1971. By 1973 the complete Unix implementation had been rewritten in C. +The C language was another corner stone in the history of Unix which +turned out to be very successful. While other programming languages +of that time have long been abandoned or play merely a niche role, C +is still one of the most widely used programming languages today. The +first Unix application was roff, a typesetting program +which is still ubiquitous as the manual pages which ship with every +Unix system are formatted with roff.

+ +

From the beginning, Thompson and his early collaborators encouraged +close communication between programmers, creating an early form of +community. Up to the present day, this "hacker culture" has been a +stimulus for countless improvements. Copies of Unix were distributed +on tapes, hand-signed with "Love, Ken". Over time many universities +contributed to Unix. By the end of the 1970s, Unix accumulated a +whole bunch of utilities that made it a fully flavored operating +system which was also free of any copyright claims.

+ +

Despite the primitive hardware of the time, the early Unix was +remarkably similar to modern Linux systems. For example, the task +scheduler, the hierarchical filesystem tree and the shell already +existed back then.

+ +SUBSECTION(«Networking») + +

The Advanced Research Projects Agency (ARPA) was a +military research unit that was part of the USA's department of +defence. It was established in the early 1960s with the mandate to +create systems that could survive a nuclear war. The agency created +the arpanet, the predecessor of today's internet, which was +designed to stay operational after subordinate network losses. By +the end of the 1960s and the early 1970s, the fundamental networking +protocols were established: telnet for remote login was standardized +in 1969, email (SMTP) in 1971, and the file transfer protocol (FTP) +in 1973.

+ +

By the end of the 1970s many Unix installations existed in +all parts of the world. However, the arpanet was mostly powered by +commercial Multics systems because Unix only had rudimentary network +support (UUCP, the Unix to Unix copy) which could copy files +over telephone lines via modems but not much more. This changed in +1983 when TCP/IP networking was developed by the ARPA to replace the +arpanet. Unix support for TCP/IP was developed at Berkeley University +which had become the "Mecca" for Unix development and started already +in 1977 to release their own Unix system named BSD, the Berkeley +Software Distribution.

+ +SUBSECTION(«Commercialization, POSIX and GNU») + +

With excellent networking support and no licensing issues, it +was only a matter of time until companies became interested in Unix +in order to make money. Several companies started to commercialize +Unix by adding features to the common code base but keeping their +improvements closed, effectively stopping the source code from being +freely distributable. At the same time Microsoft began to sell their +DOS operating system, targeting small businesses and the home market. +DOS lacked many features that Unix already had for a decade, like +multi-tasking and multi-user support, but it did run on the cheap +Intel 286 processors that were too weak for Unix.

+ +

By 1985 the commercialization of Unix and the success of Microsoft +had damaged the Unix community badly. But also the various companies +that sold their particular proprietary Unix brand realized that +too many incompatible Unix implementations would only hurt their +business. That's where the Computer Society of the Institute of +Electrical and Electronics Engineers (IEEE) became involved +in Unix. The IEEE is an organization which was already founded in +1946 to advance the theory, practice, and application of computer +technology. This organization created POSIX, the Portable Operating +System Interface for Unix, which is a family of specifications +for maintaining compatibility between operating systems. The first +version of POSIX was published in 1988. It covered several command +line utilities including vi(1) and awk(1), +the shell scripting language, application programmer interfaces (APIs) +for I/O (input/output) and networking, and more. Up to the present +day POSIX is maintained by the IEEE and new revisions of the POSIX +standard are published regularly.

+ +

In 1983 Richard Stallman launched the GNU project, and two years later the Free Software Foundation, as a reaction to the ongoing commercialization of Unix. GNU, which is a recursive acronym for "GNU's not Unix", aimed to keep the Unix source code free, or to replace non-free parts by open source equivalents. To this end the GNU project created the GNU General Public License (GPL), which requires not only the source code to stay free, but also that all subsequent modifications to the code base remain free. By the end of the 1980s, the GNU toolset had become a full developer software stack licensed under the GPL. This set of software packages was complemented by the X window system, which was also released under a free license and enabled programmers to build graphical applications for desktop systems. Moreover, the first open source scripting language, perl, was released in 1987.

+ +SUBSECTION(«Linux») + +

In 1985 Intel announced the 386 processor which, unlike its 286 +predecessor, was powerful enough to run Unix. There were efforts to +port the Unix operating system kernel to this hardware, but these +efforts were impaired by pending lawsuits about who owns the copyright +on the BSD source code. Due to the unclear legal situation of the BSD +code, the major missing piece in the GNU toolset was a free operating +system kernel. This hole was filled in 1991 when Linus Torvalds, +a student from Helsinki in Finland, announced the first version of +his Linux kernel.

+ +

Linux did not repeat the licensing problems of the original Unix +because the Linux source code was written from scratch and licensed +under the GPL. Due to this difference many developers moved from +Unix to Linux, so Linux grew quickly and started soon to outperform +the commercial Unix kernels in almost every benchmark. The cheap 386 +hardware, the Linux kernel, the GNU toolset and the graphical user +interface based on the X window system facilitated cheap workstations +which ran a complete open source software stack.

+ +

The success of Linux, or GNU/Linux as some prefer to +call it for reasons that should now be clear, has only increased +over time, to the point where commercial Unix systems are mostly +irrelevant. Today Linux runs on a wide variety of machines ranging +from supercomputers to workstations, smart phones and IOT (internet +of things) devices with very limited resources. + +

The same companies which almost killed Unix by commercializing it +in order to maximize their profit make money with Linux today. However, +they had to adjust their business model in order to comply with the +GPL. Rather than selling proprietary software, they bundle open source +software and sell support to paying customers. Some companies also +sell hardware with Linux pre-installed.

+ +SUBSECTION(«Linux Distributions») + +

A Linux Distribution is a conglomeration of free software, +including the Linux kernel, the GNU toolset and the X window system, +plus possibly other, proprietary software on top of that. Usually a +distribution also includes an installer and a package manager to +make it easy to install and update packages according to the users' +needs.

+ +

There are hundreds of Linux distributions, and new distributions +are created all the time while others are discontinued. Many +distributions are backed by companies which target specific +classes of users or hardware, but there are also non-commercial +Linux distributions which are solely driven by a community of +volunteers.

+ +

One of the most popular company-backed Linux distributions is +Ubuntu, which is led since 2004 by the UK-based Canonical Ltd. +It targets unskilled desktop users which would like to switch away +from Microsoft Windows. One reason for the popularity of Ubuntu +is that it is very easy to install on standard desktop and laptop +hardware. A distinguishing feature of Ubuntu is its strict release +cycles: New versions are released in April and October of each year, +and every fourth release is a long-term support (LTS) release +which will be supported for at least five years. Ubuntu also features +a variant for server hardware which contains a different Linux kernel +and ships with most desktop packages excluded.

+ +

The main community-driven Linux distribution is +Debian. The Debian project was founded in 1993 and the first +stable version was released in 1996. Debian is used as the basis for +many other distributions. In fact, Ubuntu is based on Debian. The +development of Debian closely follows the Unix culture in that it +is developed openly and distributed freely. A team of about 1000 +core developers work together with countless package maintainers +according to the Debian Social Contract, the Debian Constitution, +and the Debian Free Software Guidelines.

+ +EXERCISES() + +
    +
  • Run uname -a on various Unix machines to see the + OS type and the kernel version.
  • + +
  • Nice read on the + Origins and + History of Unix, 1969-1995.
  • + +
  • Explore the Unix + time line.
  • + +
  • Try out the Free Software + licensing quiz.
  • + +
  • Read the + notes on the 30th anniversary of the GNU Manifesto.
  • + +
  • Read the + Koan of Master Foo and the End User.
  • + +
  • On a Debian or Ubuntu system, run aptitude search + python to list all python-related Ubuntu packages. Run + aptitude show python-biopython to see the description + of the biopython package. Repeat with different search patterns and + packages.
  • + +
  • The Debian Social Contract (DSC) describes the agenda of Debian. + Find the DSC online, read it and form your own opinion about the key + points stated in this document.
  • +
+ +SECTION(«Characteristics of a Unix system») + +

After having briefly reviewed the history of Unix, we now look +closer at the various components which comprise a Unix system and +which distinguish Unix from other operating systems. We focus on +general design patterns that have existed since the early Unix days +and are still present on recent Linux systems.

+ +SUBSECTION(«Single Hierarchy of Files») + +

The most striking difference between Unix and Windows is perhaps +that on Unix the files of all devices are combined to form a single +hierarchy with no concept of drive letters. When the system boots, +there is only one device, the root device, which contains the +root directory. To make the files of other devices visible, +the mount operation must be employed. This operation attaches +the file hierarchy of the given device to the existing hierarchy +at a given location which is then called the mountpoint +of the device. Mountpoints are thus the locations in the hierarchy +where the underlying storage device changes.
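For illustration, the mount operation is also available to programs via the mount(2) system call. The sketch below attaches the file system of a hypothetical device to the hierarchy at /mnt and detaches it again; the device path and file system type are placeholders, and the program has to be run as root.

	/* Minimal sketch: make /mnt a mountpoint for a (hypothetical) device,
	 * then detach it again. Device path and fs type are placeholders. */
	#include <sys/mount.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		if (mount("/dev/sdb1", "/mnt", "ext4", MS_RDONLY, NULL) < 0) {
			perror("mount");
			exit(EXIT_FAILURE);
		}
		printf("/mnt is now a mountpoint\n");
		if (umount("/mnt") < 0) {
			perror("umount");
			exit(EXIT_FAILURE);
		}
		exit(EXIT_SUCCESS);
	}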

+ +

The root directory contains a couple of well-known subdirectories, +each of which is supposed to contain files of a certain type or for +a certain purpose. The following table lists a subset: + +

    +
  • /bin: Essential commands for all users
  • +
  • /sbin: Essential system binaries
  • +
  • /lib: Essential libraries
  • +
  • /usr: Non-essential read-only user data
  • +
  • /etc: Static configuration files
  • +
  • /home: Home directories
  • +
  • /tmp: Temporary files
  • +
  • /run: Files which describe the state of running programs
  • +
  • /var: Log and spool files
  • +
+ +

The Filesystem Hierarchy Standard describes the various +subdirectories in more detail. The exercises ask the reader to become +acquainted with this directory structure.

+ +SUBSECTION(«POSIX Commands and Shell») + +

The Filesystem Hierarchy Standard lists /bin +and /sbin and several other directories for executable +files. The POSIX standard defines which executables must exist in one +of these directories for the system to be POSIX-compliant. Well over +100 POSIX commands are listed in the XCU volume of this +standard. Besides the names of the commands, the general behaviour +of each and the set of command line options and their semantics are +described. POSIX versions are designed with backwards compatibility +in mind. For example, a new POSIX version might require a command +to support additional command line options, but existing options are +never dropped and never change semantics in incompatible ways. The +target audience of the POSIX document are programmers who implement +and maintain the POSIX commands and users which want to keep their +software portable across different Unix flavors.

+ +

One of the POSIX commands is the shell, +/bin/sh, an interpreter that reads input expressed in +the shell command language, which is also part of POSIX. +The shell transforms the input in various ways to produce commands +and then executes these commands. The user may enter shell code +(i.e., code written in the shell command language) interactively at +the command prompt, or supply the input for the shell as a +shell script, a text file which contains shell code. Shell +scripts which only contain POSIX commands and use only POSIX options +are portable between different shell implementations and between +different Unix flavors. They should therefore never cease to work after +an upgrade. Among the many available POSIX shell implementations, +GNU bash is one of the more popular choices. Bash is fully +POSIX compatible and offers many more features on top of what is +required by POSIX.

+ +

Several implementations of the POSIX commands exist. On Linux +the GNU implementation is typically installed while FreeBSD, NetBSD +and MacOS contain the BSD versions. Although all implementations +are POSIX-compliant, they differ considerably because different +implementations support different sets of additional features and +options which are not required by POSIX. These extensions are not +portable, and should thus be avoided in shell scripts that must work +on different Unix flavors.

+ +

In addition to the POSIX commands, a typical Unix system might well +contain thousands of other commands. This illustrates another aspect +that is characteristic for Unix: Tools should do only one specific +task, and do it well. The operating system provides mechanisms +to combine the simple commands in order to form more powerful +programs. For example, commands can be chained together so +that the output of one command becomes the input for the next command +in the chain. This is the idea behind pipes, a Unix concept +which dates back to 1973 and which is also covered by POSIX. We shall +come back to pipes and related concepts in a later section.

+ +SUBSECTION(«Multi-User, Multi-Tasking, Isolation») + +

From the very beginning Unix was designed to be a multi-user +and a multi-tasking operating system. That is, it could run multiple +programs on behalf of different users independently of each other and +isolated from each other. This design was chosen to improve hardware +utilization and robustness. In contrast, DOS and early versions +of Windows were designed for personal computing (PC) and +had no notion of user accounts, access permissions or isolation. +This resulted in an unstable system because a single misbehaving +program was enough to take down the whole system. Therefore these +features had to be retrofitted later.

+ +

While multi-tasking makes all tasks appear to run simultaneously even if there are more tasks than CPUs, isolation refers to memory protection, a mechanism which prevents applications from interfering with each other and with the internals of the operating system. A running Unix system maintains two sets of running tasks: besides the application tasks there is also a set of kernel tasks. Unlike the application tasks, the kernel tasks are privileged in that they can access the memory of the application tasks while application tasks can only access their own memory. Isolation is achieved by a hardware concept called protection domains, which already existed in Multics and thus predates Unix. In the simplest case, there are only two protection domains: a privileged domain called ring 0 for the kernel tasks, and an unprivileged domain for application tasks, also called user processes in this context. The CPU is always aware of the current protection domain as this information is stored in a special CPU register.

SUBSECTION(«System Calls and the C POSIX Library»)

Only when the CPU is running in the privileged ring 0 domain +(also known as kernel mode, as opposed to user mode +for application tasks), it can interact directly with hardware and +memory. If an application wants to access hardware, for example read a +data block from a storage device, it can not do so by itself. Instead, +it has to ask the operating system to perform the read operation +on behalf of the application. This is done by issuing a system +call. Like function calls, system calls interrupt the current +program, continue execution at a different address and eventually +return to the instruction right after the call. However, in addition +to this, they also cause the CPU to enter kernel mode so that it can +perform the privileged operation. When the system call has done its +work and is about to return to the application, the protection domain +is changed again to let the CPU re-enter user mode.

+ +

The system calls thus define the interface between applications +and the operating system. For backwards compatibility it is of utmost +importance that system calls never change semantics in an incompatible +way. Moreover, system calls must never be removed because this would +again break existing applications. The syntax and the semantics +of many system calls are specified in POSIX, although POSIX does +not distinguish between functions and system calls and refers to +both as system functions. This is because system calls are +typically not performed by the application directly. Instead, if an +application calls, for example, read(), it actually calls +the compatibility wrapper for the read system call which +is implemented as a function in the C POSIX Library (libc), +which ships with every Unix system. It is this library which does the +hard work, like figuring out which system calls are supported on the +currently running kernel, and how kernel mode must be entered on this +CPU type. Like the POSIX commands, the system functions described in +POSIX never change in incompatible ways, so programs which exclusively +use POSIX system functions are portable between different Unix flavors +and stay operational after an upgrade.
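To make the wrapper concept concrete, here is a minimal C sketch (an
illustration added here, not part of the POSIX text): it prints the
process ID twice, once through the getpid() libc wrapper and once by
entering the kernel through the syscall(2) interface with the
SYS_getpid constant. Both syscall(2) and SYS_getpid are Linux-specific
assumptions and are not covered by POSIX; portable programs stick to
the wrappers.

  #define _GNU_SOURCE          /* for syscall(2) on Linux */
  #include <stdio.h>
  #include <unistd.h>
  #include <sys/syscall.h>

  int main(void)
  {
      pid_t pid1 = getpid();           /* libc wrapper */
      long pid2 = syscall(SYS_getpid); /* raw system call, Linux only */

      printf("wrapper: %ld, raw: %ld\n", (long)pid1, pid2);
      return 0;
  }

On a Linux system both numbers are the same, which illustrates that
the wrapper merely hands the request over to the kernel.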

+ +SUBSECTION(«Multi-Layer Configuration Through Text Files») + +

On a multi-user system it becomes necessary to configure programs
according to each user's personal preferences. The Unix way to
achieve this is to provide four levels of configuration options
for each program. First, there are the built-in defaults which are
provided by the author of the program. Next, there is the system-wide
configuration that is controlled by the administrator. Third,
there is the user-defined configuration, and finally there are the
command line options. Each time the program is executed, the four
sets of configuration options are applied one after another so
that the later sets of options override the earlier settings. The
system-wide configuration is stored in /etc while the
user-defined configuration is stored in that user's home directory.
Both are simple text files that can be examined and modified with
any text editor. This makes it easy to compare two configurations
and to transfer the configuration across different machines or user
accounts.

+ +SUBSECTION(«Everything is a File») + +

Another mantra which is often heard in connection with Unix is +everything is a file. This phrase, while certainly catchy, +is slightly incorrect. A more precise version would be everything +is controlled by a file descriptor, or, as Ritchie and Thompson +stated it, Unix has compatible file, device, and inter-process +I/O. Modern Unix systems have pushed this idea further and employ +file descriptors also for networking, process management, system +configuration, and for certain types of events. The file descriptor +concept is thus an abstraction which hides the differences between the +objects the file descriptors refer to. It provides a uniform interface +for the application programmer who does not need to care about whether +a file descriptor refers to a file, a network connection, a peripheral +device or something else because the basic I/O operations like open, +read, write are the same.
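As a small illustration of this uniformity (a sketch added here, not
taken from the text above), the C program below reads a few bytes from
whatever path it is given. The default path /dev/urandom is an
assumption: it exists on Linux and the BSDs but is not mandated by
POSIX. The very same open(2) and read(2) calls work whether the path
refers to a regular file, a device node or a fifo.

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(int argc, char **argv)
  {
      const char *path = argc > 1 ? argv[1] : "/dev/urandom";
      char buf[16];
      int fd = open(path, O_RDONLY);   /* same call for files and devices */

      if (fd < 0) {
          perror("open");
          return 1;
      }
      ssize_t n = read(fd, buf, sizeof(buf));
      printf("read %zd bytes from %s\n", n, path);
      close(fd);
      return 0;
  }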

+ +

File descriptors are ubiquitous since every Unix program uses +them, albeit perhaps implicitly via higher-level interfaces provided +by a scripting language. We shall return to this topic when we discuss +processes.

+ +SUBSECTION(«Manual Pages») + +

All POSIX commands and most other programs are installed along +with one or more man pages (short for manual pages), +which are plain text files that can be formatted and displayed in +various ways. This concept was introduced in 1971 as part of the +Unix Programmer's Manual. The characteristic page layout +and the typical sections (NAME, SYNOPSIS, DESCRIPTION, EXAMPLES, +SEE ALSO) of a man page have not changed since then. The POSIX +man command is used to view man pages in a terminal. For +example, the command man ls opens the man page of the +ls command, and man man shows the man page +of the man command itself. Most implementations also +maintain a database of the existing man pages and provide additional +commands to query this database. For example, the whatis +command prints the one-line description of all man pages which match +a pattern while the apropos command searches the manual +page names and descriptions.

+ +

In addition to the man pages for commands, there are man pages for
system calls, library functions, configuration files and more. Each
man page belongs to one of several man sections. For example,
the aforementioned man pages for ls and man
are part of section 1 (user commands) while section 2 is reserved for
system calls and section 8 for administration commands that can only be
executed by privileged users. By convention, to indicate which section
a command or a function belongs to, the man section is appended in
parentheses as in mount(8). Most Unix systems also offer
translated man pages for many languages as an optional package. Note
that the same name may refer to more than one man page. For example,
there is kill(1) for the user command that kills processes
and also kill(2) which describes the corresponding system
call. To open the man page of a specific section, one may use a command
like man 2 kill. The MANSECT environment
variable can be set to a colon-delimited list of man sections to
change the order in which the man sections are searched.

+ +

Consulting the local man pages rather than searching the web has +some advantages. Most importantly, the local pages will always give +correct answers since they always match the installed software while +there is no such relationship between a particular web documentation +page and the version of the software package that is installed on the +local computer. Working with man pages is also faster, works offline +and helps the user to stay focused on the topic at hand.

+ +EXERCISES() + +
    +
  • Run df on as many systems as possible to see the + mount points of each filesystem. Then discuss the pros and cons of + a single file hierarchy as opposed to one hierarchy per device.
  • + +
  • Run ls / to list all top-level subdirectories of + the root file system and discuss the purpose of each. Consult the + Filesystem Hierarchy Standard if in doubt.
  • + +
  • Execute cd / && mc and start surfing at the root + directory.
  • + +
  • Compare the list of top-level directories that exist on different + Unix systems, for example Linux and MacOS.
  • + +
  • Find out which type of files are supposed to be stored in + /usr/local/bin. Run ls /usr/local/bin + to list this directory.
  • + +
  • Find out what the term bashism means and learn how to
  avoid bashisms.
  • + +
  • Find the POSIX specification of the cp(1) command + online and compare the set of options with the options supported by + the GNU version of that command, as obtained with man cp + on a Linux system.
  • + +
  •   +
      +
    • Run time ls / and discuss the meaning of + the three time values shown at the end of the output (see + bash(1)).
    • + +
    • Guess the user/real and the sys/real ratios for the following + commands. Answer, before you run the commands. + +
        +
      • time head -c 100000000 /dev/urandom > /dev/null
      • + +
      • i=0; time while ((i++ < 1000000)); do :; done +
      • +
      +
    • + +
    • Run the above two commands again, this time run + htop(1) in parallel on another terminal and observe the + difference.
    • +
    +
  • + +
  • On a Linux system, check the list of all system calls in + syscalls(8).
  • + +
  • The strace(1) command prints the system calls that + the given command performs. Guess how many system calls the command + ls -l will make. Run strace -c ls -l for + the answer. Read the strace(1) man page to find suitable + command line options to only see the system calls which try to open + a file.
  • + +
  • Guess how many man pages a given system has. Run whatis -w + '*' | wc -l to see how close your guess was.
  • + +
  • Search the web for "cp(1) manual page" and count how many + different manual pages are shown in the first 20 hits.
  • +
+ +HOMEWORK(« + +Think about printers, sound cards, or displays as a file. Specifically, +describe what open, read, and write should +mean for these devices. + +», « + +Opening would establish a (probably exclusive) connection +to the device. Reading from the file descriptor returned by +open(2) could return all kinds of status information, +like the type, model and capabilities of the device. For example, +printers could return the number of paper trays, the amount of toner +left etc. Writing to the file descriptor would cause output on the +device. This would mean to print the text that is written, play the +audio samples, or show the given text on the display. The point to +take away is that the open, read, write interface is a +generic concept that works for different kinds of devices, not only +for storing data in a file on a hard disk. + +») + +SECTION(«Paths, Files and Directories») + +In this section we look in some detail at paths, at a matching +language for paths, and at the connection between paths and files. We +then describe the seven Unix file types and how file metadata are +stored. We conclude with the characteristics of soft and hard links. + +SUBSECTION(«Paths») + +

The path concept was introduced in the 1960s with the Multics +operating system. Paths will be familiar to the reader because +they are often specified as arguments to commands. Also many +system calls receive a path argument. A path is a non-empty +string of path components which are separated by slash +characters. An absolute path is a path that starts with a +slash, all other paths are called relative. A relative path +has to be interpreted within a context that implies the leading +part of the path. For example, if the implied leading part is +/foo/bar, the relative path baz/qux is +equivalent to the absolute path /foo/bar/baz/qux.

+ +

Given a path, there may or may not exist a file or a +directory that corresponds to the path. Path lookup is the +operation which determines the answer to this question, taking the +implied leading part into account in case of relative paths. This +operation is always performed within the kernel and turns out to +be surprisingly complex due to concurrency and performance issues. +Consult path_resolution(7) on a Linux system to learn +more about how pathnames are resolved to files.

+ +

If a path was successfully looked up, each path component up to the +second-last refers to an existing directory while the last component +refers to either a file or a directory. In both cases the directory +identified by the second-last component contains an entry named by the +last component. We call those paths valid. The valid paths +give rise to a rooted tree whose interior nodes are directories and +whose leaf nodes are files or directories. Note that the validity of a +path depends on the set of existing files, not just on the path itself, +and that a valid path may become invalid at any time, for example if +a file is deleted or renamed. Many system calls which receive a path +argument perform path lookup and fail with the No such file or +directory error if the lookup operation fails.

+ +

It depends on the underlying filesystem whether the path components +are case-sensitive or case-insensitive. That is, +whether paths which differ only in capitalization (for example +foo and Foo) refer to the same file. +Since the hierarchy of files may be comprised of several filesystems, +some components of the path may be case-sensitive while others are +case-insensitive. As a rule of thumb, Unix filesystems are case +sensitive while Microsoft filesystems are case-insensitive even when +mounted on a Unix system.

+ +

Path components may contain every character except the Null +character and the slash. In particular, space and newline characters +are allowed. However, while dots are allowed in path components if +they are used together with other characters, the path components +. and .. have a special meaning: every +directory contains two subdirectories named . and +.. which refer to the directory itself and its parent +directory, respectively.

+ +SUBSECTION(«Globbing») + +

Globbing, also known as pathname expansion, is a pattern +matching language for paths which was already present in the earliest +Unix versions. The glob operation generates a set of valid paths from +a glob pattern by replacing the pattern by all matching +paths.

+ +

Glob patterns may contain special characters called +wildcards. The wildcard characters are:

+ +
    +
  • *: match any string,
  • +
  • ?: match any single character,
  • +
  • [...]: match any of the enclosed characters.
  • +
+ +

The complete syntax rules for glob patterns and the exact +semantics for pattern matching are described in POSIX and in +glob(7). Any POSIX-compliant shell performs globbing +to construct the command to be executed from the line entered at +the prompt. However, POSIX also demands system functions which make +globbing available to other applications. These are implemented as +part of libc.
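The following C sketch (added for illustration) uses the POSIX
glob(3) function of libc to expand a pattern; the pattern *.c is just
an arbitrary example.

  #include <glob.h>
  #include <stdio.h>

  int main(void)
  {
      glob_t gl;
      int ret = glob("*.c", 0, NULL, &gl);  /* expand the pattern */

      if (ret == 0) {
          for (size_t i = 0; i < gl.gl_pathc; i++)
              printf("%s\n", gl.gl_pathv[i]); /* one matching path per line */
          globfree(&gl);
      } else if (ret == GLOB_NOMATCH)
          printf("no match\n");
      return 0;
  }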

+ + +

There are a few quirks related to globbing which are worth pointing
out. First, if no valid path matches the given pattern, the
expansion of the pattern is, by definition according to POSIX, the
pattern itself. This can lead to unexpected results. Second, files
which start with a dot (so-called hidden files) must be
matched explicitly. For example, rm * does not
remove these files. Third, the tilde character is not a wildcard,
although it is also expanded by the shell. See the exercises for more
examples.

+ +

POSIX globbing has some limitations. For example, there is no
glob pattern which matches exactly those files whose name consists
of one or more a characters. To overcome these
limitations, some shells extend the matching language by implementing
extended glob patterns which are not covered by POSIX. For
example, if the extended globbing feature of bash(1) is
activated via the extglob option, the extended glob
pattern +(a) matches the above set of files.

+ +SUBSECTION(«File Types») + +We have seen that all but the last component of a valid path refer +to directories while the last component may refer to either a file +or a directory. The first character in the output of ls +-l indicates the type of the last path component: for +directories a d character is shown while files (also +called regular files in this context) get a hyphen character +(-). Besides directories and regular files, the following +special file types exist: + +
+
Soft link (l)
+ +
A file which acts as a pointer to another file. We shall cover + links in a dedicated subsection below.
+ +
Device node (c and b)
+ +
Also called device special. These files refer to devices + on the local system. Device nodes come in two flavors: character + devices (c) and block devices (b). Regardless + of the flavor, each device node has a major and a minor number + associated with it. The major number indicates the type of the + device (e.g. a hard drive, a serial connector, etc.) while the + minor number enumerates devices of the same type. On most systems + the device nodes are created and deleted on the fly as the set of + connected devices changes, for example due to a USB device being + added or removed. However, device nodes can also be created manually + with the mknod(1) command or the mknod(2) + system call. Device nodes do not necessarily correspond to physical + devices. In fact, POSIX demands the existence of a couple of + virtual devices with certain properties. We look at some of + these in the exercises. The access to device nodes which do correspond + to physical devices is usually restricted to privileged users.
+ +
Socket (s)
+ +
Sockets provide an interface between a running program and the + network stack of the kernel. They are subdivided into address + families which correspond to the various network protocols. For + example, the AF_INET and AF_INET6 address + families are for internet protocols (IP) while AF_LOCAL + (also known as AF_UNIX) is used for communication between + processes on the same machine. These local sockets are also called + Unix domain sockets. They can be bound to a path which + refers to a file of type socket. Regardless of the address family, + processes can exchange data via sockets in both directions, but + the local sockets support additional features, like passing process + credentials to other processes.
+ +
Fifo (p)
+ +
Files of type fifo are also known as named + pipes. They associate a path with a kernel object that provides a + First In, First Out data channel for user space programs. Data + written to the fifo by one program can be read back by another program + in the same order. Fifos are created with the mkfifo(1) + command or the mkfifo(3) library function.
+
+ +

Note that the type of a file is never inferred from the path. +In particular the suffix of the path (everything after the last +dot) is just a convention and has no strict connection to the file +type. Also there is no difference between text and binary files.

+ +SUBSECTION(«Metadata and Inodes») + +

The stat(2) system call returns the metadata
of the file or directory that corresponds to the given path. The
stat(1) command is a simple program which executes this
system call and prints the metadata obtained in this way in
human-readable form, including the file type. This is done without
looking at the contents of the file because metadata are stored in a
special area called the inode. All types of files (including
directories) have an associated inode. Besides the file type, the
inode stores several other properties prescribed by POSIX. For
example, the file size, the owner and group IDs, and the access
permissions are all stored in the inode. Moreover, POSIX requires
three timestamps to be maintained in each inode:

+ +
    +
  • modification time (mtime): time of last content change.
  • + +
  • access time (atime): time of last access.
  • + +
  • status change time (ctime): time of last modification to the + inode.
  • +
+ +

To illustrate the difference between the mtime and the ctime, +consider the chgrp(1) command which changes the group +ID of the file or directory identified by its path argument. This +command sets the ctime to the current time while the mtime is left +unmodified. On the other hand, commands which modify the contents of +a file, such as echo foo >> bar, change both the mtime +and the ctime.
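The C sketch below (an added illustration) calls stat(2) on the path
given as its argument and prints a few of the inode fields discussed
above: the file type, the size, the mode bits in octal, and the three
timestamps.

  #include <stdio.h>
  #include <sys/stat.h>
  #include <time.h>

  int main(int argc, char **argv)
  {
      struct stat st;

      if (argc < 2 || stat(argv[1], &st) < 0) {
          perror("stat");
          return 1;
      }
      printf("type: %s\n", S_ISDIR(st.st_mode) ? "directory" :
          S_ISREG(st.st_mode) ? "regular file" : "other");
      printf("size: %lld bytes\n", (long long)st.st_size);
      printf("mode: %o\n", st.st_mode & 07777);  /* permission and special bits */
      printf("mtime: %s", ctime(&st.st_mtime));
      printf("atime: %s", ctime(&st.st_atime));
      printf("ctime: %s", ctime(&st.st_ctime));
      return 0;
  }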

+ +

The inode of each file or directory contains twelve mode +bits, nine of which are the permission bits which +control who is allowed to access the file or directory, and how. The +permission bits are broken up into three classes called user +(u), group (g) and others +(o). Some texts refer to the first and last class as +"owner" and "world" instead, but we won't use this naming to avoid +confusion. Each class contains three bits. The bits of the "user" +class apply to the file owner, that is, the user whose ID is stored in +the inode. The "group" category applies to all non-owners who belong +to the group whose ID is stored in the inode. The third category +applies to all remaining users. The three bits of each class refer to +read/write/execute permission. They are therefore named r, +w and x, respectively. The permission +bits mean different things for directories and non-directories, +as described below.

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Directories Non-directories
 r  Directories: the permission to list the directory contents. More
 precisely, this bit grants the permission to call opendir(3) to obtain
 a handle to the directory which can then be passed to readdir(3) to
 obtain the directory contents. Non-directories: if read permission is
 granted, the open(2) system call does not fail with the permission
 denied error, provided the file is opened in read-only mode. The
 system call may fail for other reasons, though.
 w  Directories: the permission to add or remove directory entries.
 That is, to create new files or to remove existing files. Note that
 write permission is not required for the file that is being removed.
 Non-directories: the permission to open the file in write-only mode in
 order to perform subsequent operations like write(2) and truncate(2)
 which change the contents of the file. Non-directories are often
 opened with the intention to both read and write. Naturally, such
 opens require both read and write permissions.
 x  Directories: the permission to search the directory. Searching a
 directory means to access its entries, either by retrieving inode
 information with stat(2) or by calling open(2) on a directory entry.
 Non-directories: the permission to run the file. This applies to
 binary executables as well as to text files which start with a
 shebang, #!, followed by the path to an interpreter. We shall cover
 file execution in more detail below.
+ +

To run the regular file /foo/bar/baz, search +permission is needed for both foo and bar, +and execute permission is needed for baz. Similarly, to +open the regular file foo/bar for reading, we need execute +permissions on the current working directory and on foo, +and read permissions on bar.

+ +

A numeric permission mode is a three octal digit (0-7) +number, where the digits correspond to the user, group, other classes +described above, in that order. The value of each digit is derived by +adding up the bits with values 4 (read), 2 (write), and 1 (execute). +The following table lists all eight possibilities for each of the +three digits.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
octal value symbolic representation meaning
0 --- no permissions at all
1 --x only execute permission
2 -w- only write permission
3 -wx write and execute permission
4 r-- only read permission
5 r-x read and execute permission
6 rw- read and write permission
7 rwx read, write and execute permission
+ +

The chmod(1) command changes the permission
bits of the file identified by the path argument. For example,
chmod 600 foo sets the permissions of foo to
rw-------. Besides the octal values, chmod(1)
supports symbolic notation to address the three classes described
above: u selects the user class, g the
group class, o the class of other users. The symbolic
value a selects all three classes. Moreover, the letters
r, w and x are used to set or
unset the read, write and execute permission, respectively. The above
command is equivalent to chmod u=rw,g=,o= foo. The
+ and - characters can be specified instead
of = to set or unset specific permission bits while
leaving the remaining bits unchanged. For example, chmod go-rw
foo turns off read and write permissions for non-owners.

+ +

Unprivileged users can only change the mode bits of their own +files or directories while there is no such restriction for the +superuser.

+ +SUBSECTION(«Hard and Soft Links») + +

Links make it possible to refer to identical files through +different paths. They come in two flavors: hard and soft. Both +types of links have advantages and disadvantages, and different +limitations. We start with hard links because these existed already +in the earliest Unix versions.

+ +

A file can have more than one directory entry that points to its
inode. If two directory entries point to the same inode, they are
said to be hard links of each other. The two entries are
equivalent in that they refer to the same file. It is impossible
to tell which of the two is the "origin" from which the "link"
was created. Hard links are created with the link(2)
system call or the ln(1) command. Both take two path
arguments, one for the existing file and one for the directory entry
to be created. The filesystem maintains in each inode a link counter
which keeps track of the number of directory entries which point to the
inode. The link(2) system call increases the link count
while unlink(2) decrements the link count and removes
the directory entry. If the decremented counter remains positive,
there is still at least one other directory entry which points to
the inode. Hence the file is still accessible through this other
directory entry and the file contents must not be released. Otherwise,
when the link counter has reached zero, the inode and the file contents
may be deleted (assuming the file is not in use).

+ +

There are several issues with hard links. For one, hard links +can not span filesystems. That is, the two path arguments for +link(2) have to refer to files which reside on the +same filesystem. Second, it is problematic to create hard links to +directories. Early Unix systems allowed this for the superuser, +but on Linux the attempt to hard-link a directory always fails. +To address the limitations of hard links, soft links, also +called symbolic links (or symlinks for short), +were introduced. A soft link can be imagined as a special text file +containing a single absolute or relative path, the target of +the link. For relative paths the implied leading part is the directory +that contains the link. A soft link is thus a named reference in +the global hierarchy of files. Unlike hard links, the soft link +and its target do not need to reside on the same filesystem, and +there is a clear distinction between the link and its target. Soft +links are created with symlink(2) or by specifying the +-s option to the ln(1) command.

+ +

A soft link and its target usually point to different inodes. This +raises the following question: Should system calls which receive a +path argument that happens to be a soft link operate on the link +itself, or should they follow (or dereference) +the link and perform the operation on the target? Most system calls +follow soft links, but some don't. For example, if the path argument +to chdir(2) happens to be a soft link, the link is +dereferenced and the working directory is changed to the target of +the link instead. The rename(2) system call, however, +does not follow soft links and renames the link rather than its +target. Other system calls, including open(2), allow +the caller to specify the desired behaviour by passing a flag to the +system call. For yet others there is a second version of the system +call to control the behaviour. For example, lstat(2) is +identical to stat(2), but does not follow soft links.
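The following C sketch (added here for illustration; the names foo,
hard and soft are arbitrary) creates a hard link and a soft link to an
existing file named foo, prints the increased link count, and shows
that lstat(2), unlike stat(2), does not follow the soft link.

  #include <stdio.h>
  #include <sys/stat.h>
  #include <unistd.h>

  int main(void)
  {
      struct stat st;

      if (link("foo", "hard") < 0 || symlink("foo", "soft") < 0) {
          perror("link/symlink");
          return 1;
      }
      stat("foo", &st);
      printf("link count of foo: %ld\n", (long)st.st_nlink); /* now 2 */
      stat("soft", &st);   /* follows the link, reports the target */
      printf("stat:  %s\n", S_ISLNK(st.st_mode) ? "symlink" : "not a symlink");
      lstat("soft", &st);  /* does not follow, reports the link itself */
      printf("lstat: %s\n", S_ISLNK(st.st_mode) ? "symlink" : "not a symlink");
      return 0;
  }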

+ +

It is possible for a soft link to refer to an invalid path. In +fact, ln(1) and symlink(2) do not consider +it an error if the target does not exist, and happily create a soft +link which points to an invalid path. Such soft links are called +dangling or broken. Dangling soft links also occur +when the target file is removed or renamed, or after a mount point +change.

+ +

Soft links may refer to other soft links. System calls which +follow soft links must therefore be prepared to resolve chains of +soft links to determine the file to operate on. However, this is not +always possible because soft links can easily introduce loops into the +hierarchy of files. For example, the commands ln -s foo bar; +ln -s bar foo create such a loop. System calls detect this +and fail with the Too many levels of symbolic links +error when they encounter a loop.

+ +

Another issue with both soft and hard links is that there is no +simple way to find all directory entries which point to the same path +(soft links) or inode (hard links). The only way to achieve this is +to traverse the whole hierarchy of files. This may be prohibitive +for large filesystems, and the result is unreliable anyway unless +the filesystems are mounted read-only.

+ +EXERCISES() + +
    +
  • A path can lack both slashes and components. Give an example + of a path that lacks a slash and another example of a path that has + no components.
  • + +
  • Assume foo is an existing directory. Guess what the + command mv foo bar will do in each of the following cases: + (a) bar does not exist, (b) bar exists and + is a regular file, (c) bar exists and is a directory. + Verify your guess by running the command.
  • + +
  • Many programs check if a path is valid and act differently
  according to the result. For example, a shell script might
  check for the existence of a file with code like if test
  -e "$file"; then something_with "$file"; fi. Explain
  why this approach is not bullet-proof. How could this be
  fixed?
  • + +
  • Run touch file-{1..100} to create 100 files. Guess + what the following commands will print. Run each command to confirm. + +
      +
    • ls file-
    • +
    • ls file-*
    • +
    • ls file-?
    • +
    • ls file-[1-4]
    • +
    • ls file-[1,3,5,7,9]*
    • +
    +
  • + +
  • Find an extended glob pattern for bash(1) + that matches all valid paths whose last component starts with + file-, followed by any number of odd digits (1, 3, 5, + 7, or 9).
  • + +
  • Point out the flaw in the following shell code: for + f in file-*; do something_with "$f"; done. Hint: Search + bash(1) for "nullglob". + +
  • Create a file named -r with echo > + -r. Try to remove the file with rm -r and + discuss why this doesn't work as expected. Find a way to get rid + of the file. Discuss what happens if you run rm * in a + directory which contains a file named -r.
  • + +
  • The content of the PATH variable is a + colon-separated list of directories in which the shell looks for + commands to execute. Discuss the dangers of including the current + working directory in this list.
  • + +
  • Run id to determine a group G you + belong to but is not your primary group. Consider the following + commands mkdir foo; chgrp $G foo; touch foo/bar. What + is the group ID of foo/bar? Run the same commands, but + insert chmod g+s foo as the second-to-last command.
  • + +
  • Run man null and man zero to learn + about the properties of these two character devices.
  • + +
  • Assume the modification time stored in the inode of some file + suggests that the file was last modified two years ago. How sure + can you be that the file was never changed since then? Hint: See the + -d option of touch(1).
  • + +
  • Run the following commands echo hello > foo, + cat foo, chmod 600 foo, echo world >> + foo. Check the three timestamps with stat foo + after each command.
  • + +
  • Determine the state of the permission bits of your own + home directory by running ls -ld ~. Who can + access its contents? Also look at the permission bits of + other people's home directory.
  • + +
  • A file or directory is called world-writeable + if the w bit is set in the others + class of the permission bits. Create a world-writable + directory with mkdir foo; chmod 777 foo + and create a file in the new directory: echo hello + > foo/bar. Is a different user allowed to create + another file there (echo world > foo/baz)? Can + he remove it again (rm foo/baz)? Will he succeed + in removing foo/bar although it is owned by you + and not writable to him? Try the same with the sticky + bit turned on (chmod 1777 foo).
  • + +
  • Translate rw-r--r-- into octal, and 755 into + rwx-notation.
  • + +
  • Create a hello world script, make it + executable and run it. Create a subdirectory of your home directory + and move the script into this directory. Set the permissions of the + directory to r--------, check whether you still can + list/execute it. Do the same with --x------.
  • + +
  • Create a file with echo hello > foo, + create soft and hard links with ln -s foo soft + and ln foo hard. Examine the inode numbers + and link counts using the command stat foo soft + hard. Remove foo and repeat. Try to + access the file content through both links. Guess what + realpath foo soft hard will print. Run the + command to confirm.
  • + +
  • Create a dangling symlink with ln -s /nope foo. Would
  you expect the commands ls -l foo and cat foo
  to succeed? Run these commands to verify your guess.
  • + +
  • One of the seven Unix file types is symlink. Why is there no + file type for hard links?
  • +
HOMEWORK(«
How many paths are there that refer to the same file?
», «
Given the path /foo/bar, one may construct different paths
which refer to the same file by inserting any number of /.
or ../foo after the first component. For example,
/foo/./bar and /foo/../foo/bar both refer
to the same file. If relative paths have to be taken into account as
well, even more paths can be constructed easily. Hence the answer is:
arbitrarily many.

This illustrates the fundamental difference between a path and a
file. Paths can be mapped to files, but not the other way around. In
particular, there is no such thing as "the list of paths which have
changed since yesterday".

The concept of hard and soft links complicates
the situation further. This topic is discussed in a subsequent section. See the exercises
therein for more information.

»)

HOMEWORK(«
Given two paths, how can one tell if they refer to the same file?
», «

Among other information, the metadata record of each file contains the
so-called inode number, which uniquely identifies the file
within the file system that contains the file. Therefore, if both
paths are known to refer to files stored on the same file system,
a comparison of the two inode numbers is sufficient to tell whether
the two paths refer to the same file. The inode number can be obtained
with the command ls -i.

In the general case one additionally has to check that the
two device IDs which identify the underlying file
systems are also identical. Like the inode number, the device ID
is part of the metadata of the file. It can be obtained by running
stat(1).

»)

HOMEWORK(«
Device nodes come in two flavors: character and block devices. Explain
the difference between the two device flavors.
»)

HOMEWORK(«
    +
  • Nine of the 12 mode bits of each file are the permission + bits. The remaining three are the sticky, setuid + and setgid bits. Explain the purpose of each.
  • + +
  • Run find /usr/bin/ -perm -4000 -ls to see all SUID
  executables in /usr/bin. Discuss why those programs have
  the SUID bit set.
  • +
+ +») + +HOMEWORK(« +How many possible permission modes exist for a file or directory on +a Unix System? +», « +There are nine permission bits that can be turned on and off +independently. Hence we have 2^9=512 possibilities. When taking into +account the three special bits (sticky, setuid, setgid), the number +increases to 2^12=4096. +») + +HOMEWORK(« +Explain each command of the script +below. Show the arrangement of all files and links in a figure, +drawing a directory as a circle and a file as a square. How many +different paths exist for the file a? Discuss whether +the question "What's the path of a given file?" makes sense. + +», « + +
[Figure: the directory foo, containing the entries a and testdir.]
+ +Since foo/a, foo/testdir/a, +foo/testdir/testdir/a etc. all refer to the same file, there +are infinitely many paths for the file a. Hence the +question makes no sense: There is no such thing as the +path to a file. + +») + +HOMEWORK(« + +Recall that the path component .. refers to the +parent directory. Give an example of a path to a directory where +"parent directory" and "directory identified by the path obtained by +removing the last path component" are different. Which of the two +interpretations of .. does bash apply when you type +cd .. at the bash prompt? + +») + +HOMEWORK(« + +Is it possible to choose among all possible paths which refer to the +same file a canonical path? That is, a shortest (counting +characters) absolute path which does not contain any soft links? + +», « + +

The POSIX standard requires each Unix system library to provide
the realpath() function which performs the following
substitutions on the given path: First, the path to the current
working directory is prepended if the given path is relative
(does not begin with a slash). Second, symbolic links are replaced
by their targets. Third, any occurrences of /. and
foo/.. are removed. The path transformed in this way is
returned by the function as the canonical path.
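A minimal C sketch of how realpath() might be used (added for
illustration; passing NULL as the second argument, which lets the
function allocate the result buffer, is specified in POSIX.1-2008):

  #include <stdio.h>
  #include <stdlib.h>

  int main(int argc, char **argv)
  {
      char *resolved;

      if (argc < 2) {
          fprintf(stderr, "usage: %s path\n", argv[0]);
          return 1;
      }
      resolved = realpath(argv[1], NULL); /* result buffer is malloc'd */
      if (!resolved) {
          perror("realpath");
          return 1;
      }
      printf("%s\n", resolved);
      free(resolved);
      return 0;
  }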

+ +

Although each path can be canonicalized in this way, not all paths +which refer to the same file give rise to the same canonical path. For +example, /tmp/foo and /tmp/bar could refer +to regular files which are hard links of each other. In this case the +paths refer to the same file, yet the paths are different and already +canonicalized. The same can happen when a file system (or a subtree +of it) is bind mounted. That is, the file system tree is +visible at two or more locations in the global directory tree.

+ +The message of this exercise is to convince the reader that it is +incorrect to assume that two files are different because their paths +are different. + +») + +SECTION(«Processes») + +

A program consists of instructions and data stored in
a regular file. A user process is an instance of a running
program. This is in contrast to kernel processes which are
created directly by the kernel and have no relationship to executable
files. Since we shall only be concerned with user processes, we will
refer to these as "processes" from now on. In this section we'll see
how processes are created and removed. We will then take a closer look
at the environment of a process and discuss how processes communicate
with each other.

+ +SUBSECTION(«Process Tree, Zombies and Orphans») + +

When the system boots, there is only one process, the
init process, which is created by the kernel at the end
of the boot sequence by executing /sbin/init. All
other processes are created from existing processes by means of
the fork(2) system call. The process which called
fork(2) is said to be the parent of the newly
created child. After fork(2) returns, both
parent and child are executing independently of each other. Both
processes may call fork(2) again to create further
processes. This gives rise to a tree structure where the processes
are the nodes of the tree with init being the root node. The edges
of the tree describe the parent-child relationships.

+ +

If there are more processes than CPUs, not all processes can +run simultaneously. It is the mission of the kernel's task +scheduler to assign processes to CPUs and to perform context +switches. That is, to take away the CPU from a running process +in order to give another process the chance to run. The scheduler +has to choose the duration of each process' time slice and it must +pick the process to switch to when the time slice of the current +process has expired or the process gives up the CPU voluntarily, for +example because it is waiting for an I/O operation to complete. This +is a non-trivial task at least for modern multi-CPU systems with +non-uniform memory access (NUMA) where the memory access times +depend on the memory location and the processor core. Things don't +get easier if the CPU clock speed can vary and/or scheduling must be +power-aware to optimize battery time. To make good decisions, some +information has to be provided by the processes or by a system-wide +policy. One elementary way to prioritize certain processes over others +is via nice levels which we shall discuss below.

+ +

The normal way for a process to terminate is to call +exit(3) after it has done its work. This function +transfers an integer value, the exit status, to the +kernel. The exit status can only be retrieved by the parent of +the terminating process. To illustrate this concept, imagine an +interactive shell which creates one child each time the user enters +a command. The children are short living while the parent, the shell, +stays around for much longer. During command execution the parent needs +to wait for its child to terminate. It may then want to tell whether +the child has terminated successfully. To achieve this, the parent +calls one of the wait system calls (wait(2), +waitpid(2), waitid(2)) which block until the +child has terminated, then return the child's exit status. After the +child called exit(3) but before the parent has called +one of the wait functions, the kernel needs to keep at least the +exit status (and possibly further information) about the terminated +child. During this time window the child has already terminated but +a part of it still exists in kernel memory. Processes in this state +are aptly called zombies.
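The following C sketch (added as an illustration) shows the pattern
described above: the child terminates with exit status 42, and the
parent retrieves this value with waitpid(2). Between the two calls
the child is a zombie.

  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
      pid_t pid = fork();

      if (pid < 0) {
          perror("fork");
          return 1;
      }
      if (pid == 0)            /* child */
          exit(42);
      int status;              /* parent */
      waitpid(pid, &status, 0);
      if (WIFEXITED(status))
          printf("child %d exited with status %d\n",
              (int)pid, WEXITSTATUS(status));
      return 0;
  }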

+ +

Unlike in the shell scenario outlined above, a process might well +have any number of children at the time it terminates. Its children +then become orphans as they lose their parent. The kernel +cannot simply remove the terminated process from the process tree +because this would disconnect its orphaned children from the other +processes in the tree, destroying the tree structure. To avoid this, +orphans are reparented to init, that is, made children +of the init process. This works because the init process never +terminates.

+ +

There are several programs which show information about +processes. The POSIX ps(1) command prints a list of +processes. It has many options that control which processes are +shown and how the output is formatted. Similar programs which are not +covered by POSIX are pstree(1), top(1) and +htop(1). The former shows the tree structure while the +latter two provide a dynamic real-time view of the process tree. The +exercises of this section invite the reader to become familiar with +these essential programs.

+ +SUBSECTION(«File Execution») + +

When a process calls fork(2), the newly created +child process starts out as a copy of the calling process. However, +the reason to create a new process is usually to let the child do +something different than the parent. Therefore, fork(2) +is often followed by a second system call which replaces the child +process with a different program. There are several similar system +calls which do this, with slight semantic differences. We refer to +this family of system calls as the exec system calls.
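A minimal C sketch of the fork-then-exec pattern (added for
illustration; the path /bin/ls is an assumption about the local
system):

  #include <stdio.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
      pid_t pid = fork();

      if (pid == 0) {            /* child: replace the program */
          execl("/bin/ls", "ls", "-l", (char *)NULL);
          perror("execl");       /* only reached if execl failed */
          _exit(127);
      }
      waitpid(pid, NULL, 0);     /* parent waits for the child */
      return 0;
  }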

+ +

All exec system calls receive a path argument from which they +determine an executable file that contains the program to run. Linux +and BSD store executables in Executable and Linkable Format +(ELF). Executables are typically linked dynamically. +That is, the dependent libraries (at least libc, but possibly many +more) are loaded at runtime from separate files. This is in contrast +to static linking which includes all dependencies in the +executable, making the executable self-contained but also larger +and harder to maintain. Regardless of the type of linking, when the +program is loaded, it completely replaces the previously running +program. Note that there can be more than one process at the same +time which executes the same program.

+ +

Files in ELF format are called native executables because
they contain machine instructions which can be executed directly
by the CPU. Another type of executable is the script,
also known as an interpreter file. Scripts are text files
which start with the shebang (#!). They can
not be run directly but have to be interpreted at runtime
by another program, the interpreter. Nevertheless, it is
possible to execute a script as if it were a native executable:
by passing the path to one of the exec system calls or by typing
the path at the shell prompt. The exec system call recognizes that
the given file is a script by investigating the first line, which
contains the path to the interpreter after the shebang, optionally
followed by options to the interpreter. The kernel then executes the
interpreter rather than the script, passing the path to the script as
an argument. For example, if /foo/bar is being executed,
and the first line of this file is #!/bin/sh, the kernel
executes /bin/sh /foo/bar instead. Popular interpreters
besides /bin/sh include /bin/bash,
/usr/bin/perl, /usr/bin/python and
/usr/bin/awk.

+ +SUBSECTION(«File Descriptions and File Descriptors») + +

The kernel must always be aware of the set of all objects which are +currently in use. This set is often called the system-wide table +of open files although not all entries refer to files. In fact, an +entry may refer to any object that supports I/O operations, for example +a network socket. Each entry is called a file description, +which is a somewhat unfortunate term that was coined by POSIX. A +file description records information about the object itself as well +as the current state of the reference, including the file offset, +if applicable, and the status flags which affect how future +I/O operations are going to be performed through this reference.

+ +

The kernel maintains for each process an array of pointers to file +descriptions. Each such pointer is a file descriptor. Unlike +files and file descriptions, a file descriptor always corresponds +to a process and is identified by a non-negative number, the index +into the pointer array of that process. This index is returned by +system calls like open(2) or socket(2). +As far as user space programs are concerned, a file descriptor is +synonymous with this integer. It can be regarded as an abstract +handle that must be supplied to subsequent I/O operations +like read(2) or write(2) to tell the system +call the target object of the operation.

+ +

The shell automatically creates three file descriptors for each +process which are identified by the integers 0, 1 and 2. They are +called stdin, stdout, and stderr, which is +short for standard input/output/error. It is possible, and in +fact common, that all three file descriptors point to the same file +description: the terminal device. Many command line tools read their +input from stdin, write normal output to stdout, and error messages +to stderr. For example, when the POSIX command cat(1) +is invoked with no arguments, it reads data from stdin and writes +the same data to stdout.
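The core of such a tool can be written in a few lines of C (an added
sketch, not the actual implementation of cat(1)): it copies everything
from file descriptor 0 to file descriptor 1 until read(2) reports end
of file.

  #include <unistd.h>

  int main(void)
  {
      char buf[4096];
      ssize_t n;

      while ((n = read(0, buf, sizeof(buf))) > 0)  /* 0 is stdin */
          if (write(1, buf, n) != n)               /* 1 is stdout */
              return 1;    /* short write or write error */
      return n < 0;        /* non-zero exit if read failed */
  }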

+ +SUBSECTION(«Signals») + +

Signals are another ancient Unix concept that dates back to the
early 1970s and was standardized in POSIX long ago. This concept
facilitates a rudimentary form of inter-process communication
(IPC) between unrelated processes. Signals can be regarded as software
interrupts that transmit a notification event from the sending process
to the target process. The event is sent asynchronously,
meaning that the interruption can happen at any point in the code
flow.

+ +

It is fair to say that most non-trivial programs, including +scripts, have to deal with signals. All major scripting languages +(bash, python, perl, ruby, etc.) provide an API for signal +handling. The interpreter of the scripting language ends up calling +the POSIX system functions, so we will only look at these.

+ +

Signals are identified by name or by a numerical ID. For example, +SIGINT (interrupt from keyboard) is the name for signal +number 2. POSIX defines 31 standard signals plus at least +eight real-time signals. The standard signals can be +subdivided according to the origin of the signal as follows.

+ +
+
hardware related signals
+ +
These signals originate from hardware traps that force + the CPU back into kernel mode. The kernel responds to the trap by + generating a signal for the process that caused the trap. For example, + a division by zero in a user space program triggers a hardware trap + in the floating point unit (FPU) of the CPU. The kernel + then generates the SIGFPE (floating-point exception) + signal for the process. Another example for a signal that originates + from a hardware trap is SIGSEGV (segmentation fault) + which occurs when a process attempts to reference a memory address + that has not been mapped (i.e., marked as valid) by the memory + management unit (MMU) of the CPU.
+ +
kernel generated signals
+ +
Signals which originate from the kernel rather than from + hardware. One example is SIGCHLD (child terminated), + which is sent to the parent process when one of its child processes + terminates. Another example is SIGWINCH (window resize), + which is generated when the geometry of the controlling terminal of + a process changes.
+ +
user-space generated signals
+ +
These signals can only originate from user space when a process, + for example kill(1), calls raise(2) + or kill(2) to instruct the kernel to generate a + signal. Examples are SIGTERM, which issues a termination + request, and SIGUSR1 and SIGUSR2 which are + reserved for use by application programs.
+
The following signals are used frequently and deserve to be described
explicitly. We refer to signal(7) for the full list of
signals and their semantics.
+
SIGINT, SIGTERM and SIGKILL
+ +
All three signals terminate the process by default. + SIGINT is generated for the foreground processes + when the interrupt character (CTRL+C) is pressed in a + terminal. For example, if CTRL+C is pressed while the shell pipeline + find | wc is executing, SIGINT is sent + to both processes of the pipeline. SIGTERM is the + default signal for the kill(1) command. It requests + the target process to run its shutdown procedure, if any, then + terminate. SIGKILL instructs the kernel to terminate the + target process immediately, without giving the process the chance to + clean up. This signal can originate from a process or from the kernel + in response to running out of memory. To keep the system working, the + kernel invokes the infamous out of memory killer (OOM killer) + which terminates one memory-hungry process to free some memory.
+ +
SIGSTOP, SIGTSTP and SIGCONT
+ +
SIGSTOP instructs the task scheduler of the kernel to + no longer assign any CPU time to the target process until the process + is woken up by a subsequent SIGCONT. SIGTSTP + (stop typed at terminal) stops all foreground processes of a terminal + session. It is generated when the stop character (CTRL+Z) + is pressed in a terminal.
+
+ +

Processes may set the signal disposition of most signals +to control what happens when the signal arrives. When no disposition +has been set, the signal is left at its default disposition so +that the default action is performed to deliver the signal. +For most signals the default action is to terminate the process, +but for others the default action is to ignore the signal. +If the signal is neither ignored nor left at its default disposition, +it is said to be caught by the process. To catch a signal the +process must tell the kernel the address of a function, the signal +handler, to call in order to deliver the signal. The set of +signal dispositions of a process can thus be imagined as an array +of function pointers with one pointer per possible signal. If the +process catches the signal, the pointer points to the corresponding +signal handler. A NULL pointer represents a signal that was left at +its default disposition while the special value SIG_IGN +indicates that the process explicitly asked to ignore this signal.
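The C sketch below (added for illustration) catches SIGINT by
installing a signal handler with the POSIX sigaction(2) function.
While it runs, pressing CTRL+C no longer terminates the process;
instead the handler sets a flag which the main program checks.

  #include <signal.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  static volatile sig_atomic_t got_sigint;

  static void handler(int sig)
  {
      (void)sig;
      got_sigint = 1;   /* only async-signal-safe operations here */
  }

  int main(void)
  {
      struct sigaction sa;

      memset(&sa, 0, sizeof(sa));
      sa.sa_handler = handler;       /* catch the signal */
      sigemptyset(&sa.sa_mask);
      sigaction(SIGINT, &sa, NULL);
      while (!got_sigint)
          pause();                   /* wait for a signal to arrive */
      printf("caught SIGINT\n");
      return 0;
  }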

+ +

Signals can also be blocked and unblocked. When +a signal is generated for a process that has it blocked, it remains +pending. Pending signals cause no action as long as the +signal remains blocked but the action will be performed once the +signal gets unblocked. SIGKILL and SIGSTOP +can not be caught, blocked, or ignored.

+ +

Real-time signals are similar to SIGUSR1 and +SIGUSR2 in that they have no predefined meaning but +can be used for any purpose. However, they have different semantics +than the standard signals and support additional features. Most +importantly, real-time signals are queued, meaning that in +contrast to standard signals the same real-time signal can be pending +multiple times. Also, the sending process may pass an accompanying +value along with the signal. The target process can obtain this +value from its signal handler along with additional information like +the PID and the UID of the process that sent the signal.

+ +

Some system calls including read(2) and +write(2) may block for an indefinite time. For +example, reading from a network socket blocks until there is data +available. What should happen when a signal arrives while the process +is blocked in a system call? There are two reasonable answers: Either +the system call is restarted, or the call fails with the +Interrupted system call error. Unfortunately, different +flavors of Unix handle this case differently by default. However, +applications may request either behaviour by setting or clearing the +SA_RESTART flag on a per-signal basis.

+ +SUBSECTION(«Environment of a Process») + +

Now that we have a rough understanding of processes we look
more closely at the information the kernel maintains for each process.
We already discussed the array of file descriptors and the array of
signal dispositions. Clearly both are process-specific properties.
As we shall see, there is much more that constitutes the environment
of a process.

+ +

Each process is identified by a unique process ID +(PID), which is a positive integer. The init process is +identified by PID 1. PIDs are assigned in ascending order, but are +usually restricted to the range 1..32767. After this many processes +have been created, PIDs wrap and unused PIDs are recycled for new +processes. Thus, on a busy system on which processes are created and +terminated frequently, the same PID is assigned to multiple processes +over time.

+ +

Not all processes call fork(2) to create a child +process, but each process except the init process has a unique +parent. As described before, this is either the "real" parent (the +process which created the process earlier) or the init process that +"adopted" the orphaned process in case the real parent terminated +before the child. The process ID of the parent (PPID) is thus +well-defined. A process can obtain its PID and PPID with the +getpid(2) and getppid(2) system calls.

+ +

Each process runs on behalf of a user (possibly the superuser) +which is identified by its user ID (UID) and belongs to +one or more groups, identified by one or more group IDs +(GIDs). The superuser is identified by UID zero. When we talked +about the permission bits of files and directories, we said that +suitable permissions are needed for system calls which operate on +files (open(2), stat(2), etc.). A more +precise statement is that the process which calls, say, +open(2) needs to have these permissions. To decide this, +the kernel needs to take into account the UID and GIDs of the process +that called open(2), the UID and the GID stored in the +inode of the file that is being opened, and the permission bits of +this file. The UID is also taken into account for kill(2) +because unprivileged processes (non-zero UID) can only send signals +to processes which run on behalf of the same user while the superuser +may target any process.

+ +

Each process has a current working directory (CWD) +associated with it. When the user logs in, the CWD of the login shell +process is set to his home directory, which should always +exist and have the read, write and execute permission bits set for +the user. The CWD can later be changed with chdir(2) +and be retrieved with getcwd(3). The CWD is used as the +starting point for path searches for relative paths. It affects most +system calls which receive a path argument. For example, if the CWD +is /foo/bar and the relative path baz/qux +is passed to open(2), the kernel will attempt to open +the file which is identified by /foo/bar/baz/qux.

+ +

Many programs accept arguments to control their behavior. +In addition to the path to the program that is to be executed, +all variants of the exec system calls receive an array of arguments +called the argument vector. For example, when the command +ls -l foo is executed, the argument vector contains +the two strings "-l" and "foo". Note that +the argument vector is not part of the program but is tied to the +process. It is passed to the main function at startup so that the +program may evaluate it and act accordingly.

+ +

Another way to pass information to a program is via environment +variables. Environment variables are strings of the form +name=value. POSIX describes the API to maintain the +environment variables of a process. Environment variables are set +with setenv(3) or putenv(3), the value of a +variable can be retrieved with getenv(3), and a variable +and its value can be deleted with unsetenv(3). The set of +environment variables is sometimes called the environment +of the process, although we use this term in a broader sense to +describe the entirety of all metadata maintained by the kernel about +the process, including but not limited to environment variables.
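The following C sketch (added as an illustration) lets a process print
a few items of its own environment: its PID and PPID, its current
working directory, and the value of the HOME variable, which is chosen
here merely as an example of an environment variable.

  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>

  int main(void)
  {
      char cwd[4096];

      printf("pid:  %d\n", (int)getpid());
      printf("ppid: %d\n", (int)getppid());
      if (getcwd(cwd, sizeof(cwd)))
          printf("cwd:  %s\n", cwd);
      const char *home = getenv("HOME");
      printf("home: %s\n", home ? home : "(unset)");
      return 0;
  }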

+ +

Each process also has about a dozen resource limits +that can be set and queried with the POSIX setrlimit(2) +and getrlimit(2) functions. Each limit controls a +different aspect. For example, RLIMIT_CPU limits the +CPU time the process is allowed to use and RLIMIT_NOFILE +controls how many open files it may have at a time. For each resource +there is a soft and a hard limit. The kernel +enforces the value stored as the soft limit. This value may be set +by an unprivileged process to any value between zero and the hard +limit. Unprivileged processes may also reduce (but not increase) their +hard limits. Once a hard limit is reduced, it can not be increased +any more. For RLIMIT_CPU a special convention applies: +If the soft limit is reached, the kernel sends SIGXCPU +(CPU time limit exceeded) to notify the process about this fact so +that it can terminate orderly (e.g., remove temporary files). When +the hard limit is reached, the kernel terminates the process as if +it received SIGKILL.
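As an added illustration, the C sketch below queries the soft and hard
limit on the number of open files with getrlimit(2); RLIMIT_NOFILE is
just one of the resources that can be inspected this way.

  #include <stdio.h>
  #include <sys/resource.h>

  int main(void)
  {
      struct rlimit rl;

      if (getrlimit(RLIMIT_NOFILE, &rl) < 0) {
          perror("getrlimit");
          return 1;
      }
      printf("open files: soft=%llu, hard=%llu\n",
          (unsigned long long)rl.rlim_cur,
          (unsigned long long)rl.rlim_max);
      return 0;
  }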

+ +

The nice level of a process provides a hint for +the task scheduler to give the process a lower or higher priority +relative to other processes. Nice levels range between -20 and 19. A +high nice level means that the process wants to be nice to other +processes, that is, should run with reduced priority. Low nice levels +indicate important processes that should be prioritized over other +processes. The default nice level is zero. Unprivileged users may +set the nice level of new processes with the nice(1) +command to any non-negative value. They can also increase the nice +level of their existing processes with renice(1), but +never decrease it. The superuser, however, may set the nice level +of any process to an arbitrary value in the valid range.
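+The following sketch reads the nice level of the calling process with
+getpriority(2) and increases it by five, i.e., lowers its own priority
+(strict error checking would also have to inspect errno, because -1 is
+a valid nice level):
+
+	#include <stdio.h>
+	#include <sys/resource.h>
+
+	int main(void)
+	{
+		int level = getpriority(PRIO_PROCESS, 0); /* 0: this process */
+
+		printf("current nice level: %d\n", level);
+		if (setpriority(PRIO_PROCESS, 0, level + 5) != 0)
+			perror("setpriority");
+		printf("new nice level: %d\n", getpriority(PRIO_PROCESS, 0));
+		return 0;
+	}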

+ +

The bulk of the properties discussed so far are inherited by the +child after a fork(2). Specifically, the child gets the +same array of file descriptors and signal dispositions as its parent, +runs on behalf of the same user, has the same working directory, +the same resource limits and nice level, and also the same set +of environment variables with identical values. The PID and PPID, +however, are different of course.

+ +

After a process has called an exec function to replace itself with +a new program, its signal handlers no longer exist because they were +part of the program code which has been replaced. Therefore the exec +system calls reset the disposition of all signals that were caught to +the default disposition. Signals that were being ignored keep being +ignored, however.

+ +SUBSECTION(«The Process Filesystem») + +

Although not covered by POSIX, at least Linux, NetBSD and FreeBSD +provide information about processes via the process filesystem +(procfs), which is usually mounted on /proc. The process +filesystem is a pseudo-filesystem, i.e., it has no underlying +storage device. Files and directories are faked by the kernel as they +are accessed. Each process is represented by a numerical subdirectory +of /proc which is named by the PID. For example, +/proc/1 represents the init process. The aforementioned +process utilities (ps(1), top(1), etc.) read +the contents of the process filesystem in order to do their job.

+ +

Each /proc/[pid] directory contains the same set +of files although this set is different between Unixes. These files +expose much of the environment of the process to user space. The Linux +procfs implementation provides text files named environ +and limits which contain the current environment and +the resource limits of the process, respectively. Moreover, the +file descriptor array of each process is exposed in the files of +the /proc/[pid]/fd directory. Linux and NetBSD (but not +FreeBSD) also provide a cwd soft link which points to +the current working directory of the process.

+ +SUBSECTION(«Pipes and Redirections») + +

The pipe(2) system call takes no arguments and +creates two file descriptors for the calling process which are tied +together as a unidirectional first in, first out data channel that +works just like a fifo, but without any files being involved. One +file descriptor is the read end of the pipe, the other is +the write end. Data written to the write end is buffered by +the kernel and can be obtained by reading from the read end.

+ +

One application of pipes is communication between +related processes. A process first creates a pipe, then calls +fork(2) to create a child process. The child inherits +a copy of both pipe file descriptors. Hence the parent process can +communicate with the child by writing a message to the write end of +the pipe for the child to read.
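+This scheme can be condensed into a few lines of C. In the sketch below
+the parent sends the string "hello" through the pipe and the child prints
+what it receives:
+
+	#include <stdio.h>
+	#include <string.h>
+	#include <sys/wait.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int pfd[2];
+		char buf[32] = "";
+
+		if (pipe(pfd) != 0)
+			return 1;
+		if (fork() == 0) { /* child: only the read end is needed */
+			close(pfd[1]);
+			read(pfd[0], buf, sizeof(buf) - 1);
+			printf("child received: %s\n", buf);
+			return 0;
+		}
+		close(pfd[0]); /* parent: only the write end is needed */
+		write(pfd[1], "hello", strlen("hello"));
+		close(pfd[1]);
+		wait(NULL);
+		return 0;
+	}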

+ +

The POSIX dup(2) and dup2(2) system
+calls allow a process to manipulate the entries of its file descriptor
+array. In particular the standard file descriptors 0, 1, and 2 can be
+replaced. By doing so before performing an exec system call, it can
+be arranged that the replacement program starts with, say, its stdout
+file descriptor redirected to the write end of a pipe. Note that
+the replacement program does not need any modifications for this to
+work, and might not even be aware of the fact that it is not writing
+its output to the terminal as usual.
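+For example, the following sketch redirects its stdout file descriptor to
+a file rather than a pipe, to keep it short (the name out.txt is arbitrary),
+and then executes date(1), which writes to the file without knowing about
+the redirection:
+
+	#include <fcntl.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int fd = open("out.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);
+
+		if (fd < 0)
+			return 1;
+		dup2(fd, STDOUT_FILENO); /* stdout now refers to out.txt */
+		close(fd);
+		execlp("date", "date", (char *)NULL);
+		return 127; /* only reached if exec failed */
+	}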

+ +

Shells employ this technique to implement the | +operator which "pipes" the output of one command "into" another +command. For example, the pipeline ls | wc works +as follows: First the shell creates a pipe, then it calls +fork(2) twice to create two processes which both +get a copy of the two pipe file descriptors. The first process +replaces its stdout file descriptor with the write end of the +pipe and performs an exec system call to replace itself with the +ls(1) program. The second process replaces its stdin +file descriptor with the read end of the pipe and replaces itself +with wc(1). Since ls(1) writes to stdout +and wc(1) reads from stdin, wc(1) processes +the output of ls(1).
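+The sketch below imitates what the shell does for ls | wc,
+combining pipe(2), fork(2), dup2(2) and exec as described above:
+
+	#include <sys/wait.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		int pfd[2];
+
+		if (pipe(pfd) != 0)
+			return 1;
+		if (fork() == 0) { /* first child becomes ls(1) */
+			dup2(pfd[1], STDOUT_FILENO);
+			close(pfd[0]);
+			close(pfd[1]);
+			execlp("ls", "ls", (char *)NULL);
+			_exit(127); /* only reached if exec failed */
+		}
+		if (fork() == 0) { /* second child becomes wc(1) */
+			dup2(pfd[0], STDIN_FILENO);
+			close(pfd[0]);
+			close(pfd[1]);
+			execlp("wc", "wc", (char *)NULL);
+			_exit(127);
+		}
+		close(pfd[0]); /* the "shell" keeps neither end open */
+		close(pfd[1]);
+		while (wait(NULL) > 0)
+			;
+		return 0;
+	}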

+ +

Note that this trick does not work to establish a connection +between two existing processes because it depends on file +descriptor inheritance across fork(2). In the general +case one has to fall back to sockets or fifos to create the data +channel.

+ +SUBSECTION(«Stdio») + +

The POSIX standard requires a compliant Unix system to provide
+a collection of functions that let applications perform input and
+output by means of operations on streams. This programming
+interface, known as stdio for standard input/output,
+has been part of every Unix system since 1979. Every program which contains
+a printf(3) statement relies on stdio.

+ +

The stdio functions are implemented as part of libc on top of the +open(2), read(2) and write(2) +system calls which are implemented in the kernel. Roughly speaking, +stdio replaces the file descriptor API by a more abstract API +which centers around streams. A stream is an opaque data structure +which comprises a file descriptor and an associated data buffer for +I/O. Each program has three predefined streams which correspond to +the three standard file descriptors (stdin, stdout and stderr). The +stdio API contains well over 50 functions to create and maintain +streams and to perform I/O on streams. These functions take care of +the characteristics of the underlying file description. For example, +they automatically try to select the optimal I/O buffer size.

+ +

Many applications rely on stdio because of convenience. For +one, the buffers for read(2) and write(2) +must be allocated and freed explicitly by the application, and care +must be taken to not overflow these buffers. With stdio, this task +is done by the stdio library. Second, formatted I/O is +much easier to do with the stdio functions because the programmer +only has to provide a suitable format string to convert +between the machine representation and the textual representation of +numbers. For example, by passing the format string "%f" +to scanf(3), the programmer tells the stdio library to +read a floating point number stored in textual representation from the +specified stream, convert it to machine representation and store it +in the given variable. The fprintf(3) function works the +other way round: the value is converted from machine representation +to text, and this text is written to the stream. Both functions can +deal with various formats, like scientific notation for floating +point numbers (e.g., 0.42E-23). With stdio it is easy to customize +the formatted output, for example add leading zeros or select the +number of decimal digits in the textual representation of a floating +point number.
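+A two-line example of such conversions, using sscanf(3) to parse a number
+given in scientific notation and printf(3) to print it with a customized
+format:
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		double x;
+
+		if (sscanf("0.42E-23", "%lf", &x) == 1)
+			printf("%.3E and %012.6f are the same number\n", x, x);
+		return 0;
+	}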

+ +

Another reason why many programs rely on stdio is that it performs +buffered I/O. With buffered I/O not each read/write operation +results in a system call. Instead, data read from or written to the +stream is first stored in the user space buffer that is associated +with the stream. This is a performance improvement for applications +which perform many small I/O operations because every system call +incurs some overhead. Buffers may be flushed explicitly by +calling fflush(3), or implicitly by the stdio library. The +criteria which cause an implicit flush depend on the buffering +type of the stream. Three buffering types exist.

+ +
+
unbuffered
+ +
The stdio library does not buffer any reads or writes. Instead, + each I/O operation results in a read(2) or + write(2) system call. By default the stderr stream is + unbuffered to display error messages as quickly as possible.
+ +
line buffered
+ +
System calls are performed when a newline character is + encountered. This buffering type applies by default to interactive + sessions where the file descriptor of the stream refers to a terminal + device (as determined by isatty(3)).
+ +
fully buffered
+ +
I/O takes place only if the buffer of the stream is empty/full. By + default, if the file descriptor refers to a regular file, the + stream is fully buffered. POSIX requires that stderr is never fully + buffered.
+
+ +
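+The buffering type of a stream can be changed with setvbuf(3). The sketch
+below makes stdout fully buffered, so when it is run on a terminal the text
+only shows up after the explicit fflush(3) (or at exit):
+
+	#include <stdio.h>
+	#include <unistd.h>
+
+	int main(void)
+	{
+		static char buf[BUFSIZ];
+
+		setvbuf(stdout, buf, _IOFBF, sizeof(buf)); /* fully buffered */
+		printf("this text sits in the stdio buffer for three seconds...\n");
+		sleep(3);
+		fflush(stdout); /* ...until the buffer is flushed explicitly */
+		return 0;
+	}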

The exercises on stdio focus on the three different buffering +types because this is a common source of confusion.

+ +SUBSECTION(«The Virtual Address Space of a Process») + +

Isolation refers to the concept that each process gets its own +virtual address space. A rough understanding of the memory +management system and the layout of the virtual address space of +a process helps to locate the source of common problems like the +infamous segmentation fault error, and to realize that +putatively simple questions such as "how much memory is my process +currently using?" are in fact not simple at all, and need to be made +more precise before they can be answered.

+ +
+ +define(«vas_width», «200») +define(«vas_height», «300») +define(«vas_vmem_left_margin», «5») +define(«vas_vmem_top_margin», «5») +define(«vas_mem_width», «20») +define(«vas_gap_width», «30») +define(«vas_vmem_height», «140») +define(«vas_vmem_color», «#34b») +define(«vas_pmem_height», «100») +define(«vas_pmem_color», «#7e5») +define(«vas_vmem_unmapped_color», «#a22») +define(«vas_vmem_swapped_color», «yellow») +define(«vas_pmem_unavail_color», «orange») +define(«vas_disk_gap», «15») +define(«vas_disk_height», «20») +define(«vas_disk_color», «grey») +define(«vas_x1», «vas_vmem_left_margin()») +define(«vas_x2», «eval(vas_x1() + vas_mem_width())») +define(«vas_x3», «eval(vas_x2() + vas_gap_width())») +define(«vas_x4», «eval(vas_x3() + vas_mem_width())») + +define(«vas_membox», « + +») +define(«vas_vmem_unmapped_box», « + + +») +define(«vas_vmem_swapped_box», « + + +») +define(«vas_pmem_unavail_box», « + + +») +define(«vas_vmem_hline», « + + +») + +define(«vas_pmem_hline», « + «» + + +») +define(«vas_arrow», « + +») +define(«vas_disk», « + + + +») + + + + + + vas_membox(«vas_vmem_color()», «vas_vmem_height()», «0») + vas_membox(«vas_pmem_color()», «vas_pmem_height()», + «eval(vas_gap_width() + vas_mem_width())») + vas_vmem_hline(«10») + vas_vmem_hline(«40») + vas_vmem_unmapped_box(«40», «20») + vas_vmem_swapped_box(«60», «60») + + vas_pmem_unavail_box(«0», «10») + vas_pmem_hline(«20») + vas_pmem_unavail_box(«20», «30») + vas_pmem_hline(«80») + + vas_arrow(«5», «15») + vas_arrow(«25», «65») + vas_arrow(«130», «90») + vas_disk() + vas_arrow(«90», «eval(vas_pmem_height() + vas_disk_gap() + + vas_disk_height() / 2)») + +
+ +

Virtual memory is an abstraction of the available memory resources. +When a process reads from or writes to a memory location, it refers +to virtual addresses (illustrated as the left box of the +diagram). Virtual addresses are mapped by the MMU to physical +addresses which refer to physical memory locations (right +box). The mapped virtual address space of a process is a +collection of ranges of virtual addresses which correspond to physical +memory addresses (blue areas). By storing less frequently-accessed +chunks of virtual memory (yellow) on the swap area (grey), applications +can use more memory than is physically available. In this case the +size of the valid virtual addresses (blue and yellow areas together) +exceeds the amount of physical memory (orange and green areas). Any +attempt to access an unmapped memory location (red and yellow areas) +results in a page fault, a hardware trap which forces the CPU +back into kernel mode. The kernel then checks whether the address is +valid (yellow) or invalid (red). If it is invalid, the kernel sends +SIGSEGV, which usually terminates the process with +the segmentation fault error. Otherwise it allocates +a chunk of unused physical memory, copies the chunk from the swap +area to the newly allocated memory and adjusts the mapping (i.e., +a yellow part becomes blue). The virtual memory concept increases +stability and security because no process can access physical memory +which belongs to the kernel or to other processes (orange areas).

+ +

We've already seen that the fork(2) system call +creates a new process as a duplicate of the calling process. Since +the virtual address space of the calling process (a) might be large +and (b) is likely to be replaced in the child by a subsequent call +to an exec function, it would be both wasteful and pointless to +copy the full address space of the parent process to the child. To +implement fork(2) efficiently, operating systems +employ an optimization strategy known as Copy on Write +(CoW). The idea of CoW is that if multiple callers ask for resources +which are initially indistinguishable, you can give them pointers to +the same resource. This function can be maintained until a caller +tries to modify its copy of the resource, at which point a true +private copy is created to prevent the changes becoming visible to +everyone else. The primary advantage is that if a caller never makes +any modifications, no private copy needs ever be created. The +fork(2) system call marks the pages of the virtual address +space of both the parent and the child process as CoW by setting a +special bit in the page table entry which describes the +mapping between virtual and physical addresses of the MMU. As for +invalid memory accesses, the attempt to write to a CoW page results +in a page fault that puts the CPU back into kernel mode. The kernel +then allocates a new memory page on behalf of the process, copies +the contents of the page which caused the fault, changes the page +table mappings for the process accordingly and returns to user space. +This all happens transparently to the process.
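+The effect of CoW can be observed with a few lines of C: after
+fork(2) both processes initially see the same value, but the
+child's write only affects the child's private copy of the page:
+
+	#include <stdio.h>
+	#include <sys/wait.h>
+	#include <unistd.h>
+
+	static int value = 42;
+
+	int main(void)
+	{
+		if (fork() == 0) { /* child */
+			value = 1;
+			printf("child: value = %d\n", value); /* 1 */
+			return 0;
+		}
+		wait(NULL); /* let the child finish first */
+		printf("parent: value = %d\n", value); /* still 42 */
+		return 0;
+	}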

+ +
+define(«asl_width», «300») +define(«asl_height», «400») +define(«asl_top_margin», «10») +define(«asl_text_width», «35») +define(«asl_mem_width», «25») +define(«asl_mem_color_env», «#fc8») +define(«asl_mem_color_stack», «#8fc») +define(«asl_mem_color_empty», «#ccc») +define(«asl_mem_color_heap», «#c8f») +define(«asl_mem_color_bss», «#8cf») +define(«asl_mem_color_data», «#cf8») +define(«asl_mem_color_text», «#f8c») +define(«asl_font_size», «5») + +define(«asl_arrow», « + +») +define(«asl_arrow_text», « + + $2 + +») + +dnl $1: y0, $2; height, $3: color, $4: high arrow text +dnl $5: low arrow text, $6: desc + +define(«asl_box», « + + ifelse(«$4», «», «», « + asl_arrow(«eval($1 + asl_top_margin())») + asl_arrow_text(«eval($1 + asl_top_margin() - 2)», «$4») + ») + ifelse(«$5», «», «», « + asl_arrow(«eval($1 + $2 + asl_top_margin())») + asl_arrow_text(«eval(asl_top_margin() + + $1 + $2 - 2)», «$5») + ») + + $6 + +») + + + asl_box(«0», «10», «asl_mem_color_env», «2^64 - 1», «», + «Environment») + asl_box(«10», «15», «asl_mem_color_stack», «», «base pointer», + «Stack») + asl_box(«25», «30», «asl_mem_color_empty», «», «break point», + «Empty») + asl_box(«55», «35», «asl_mem_color_heap», «», «», «Heap») + asl_box(«90», «10», «asl_mem_color_bss», «», «», «BSS») + asl_box(«100», «10», «asl_mem_color_data», «», «», «Data») + asl_box(«110», «10», «asl_mem_color_text», «», «0», «Text») + +
+ +

The diagram on the left illustrates the layout of the virtual
+address space of a process. At the top of the address space are the
+argument vector and the environment variables. The stack
+stores the local variables of the functions which are currently
+being called, plus house-keeping data like the return addresses
+of these functions. As more functions are called, the stack grows
+downwards towards the lower addresses. Its current lower end is
+called the base pointer. The other variable area of the
+address space is the heap, which contains the memory that
+has been allocated on behalf of the process, for example with
+malloc(3). As the process allocates more memory, the heap grows
+upwards, towards the stack. The current end of the heap is called the
+break point. The lower part of the address space contains
+three segments of fixed size. The text segment contains the
+compiled machine instructions of the executable, while the data
+segment contains the initialized variables whose values are already known
+at compile time. Finally, the BSS segment is allocated and
+zeroed at execution time. This segment contains variables which should
+be initialized to zero at startup. Unlike the data segment it is not
+stored in the executable. BSS stands for "Block Started by Symbol",
+which is a historic name coined in the 1950s. It has no relation to
+the real meaning of the segment.
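+To see this layout in practice, one can print one address from each segment
+of a running process, as in the sketch below (the numbers differ between
+runs on systems with address space layout randomization):
+
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	int initialized = 7;	/* data segment */
+	int uninitialized;	/* BSS segment */
+
+	int main(void)
+	{
+		int local;			/* stack */
+		void *dynamic = malloc(1);	/* heap */
+
+		printf("text:  %p\n", (void *)main);
+		printf("data:  %p\n", (void *)&initialized);
+		printf("bss:   %p\n", (void *)&uninitialized);
+		printf("heap:  %p\n", dynamic);
+		printf("stack: %p\n", (void *)&local);
+		free(dynamic);
+		return 0;
+	}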

+ +The exercises of this section invite the reader to look at the virtual +address space of running processes to learn what happens when a +dynamically-linked executable is being executed and how the resulting +memory maps affect the virtual address space of the newly created +process. + +EXERCISES() + +
    +
  • Examine your own processes with htop, ps + ux and pstree -panuch $LOGNAME.
  • + +
  • Run ls -l /proc/$$ and examine the environment of + your shell process.
  • + +
  • Run kill -l and discuss the meaning of signals + 1-15. Use signal(7) as a reference.
  • + +
  • Create a zombie process: run sleep 100&. From + another terminal, send SIGSTOP to the parent process + of the sleep process (the shell), then send SIGKILL + to the sleep process. Run cat /proc/$PID/status where + $PID is the process ID of the sleep process.
  • + +
  • Run echo $$ to obtain the PID of an interactive + shell that is running in a terminal. Send the SIGSTOP + and SIGCONT signals to this PID from another terminal + and see what happens when you type in the terminal that contains the + stopped shell process.
  • + +
  • The ping(8) utility catches SIGQUIT. + In one terminal execute ping localhost. While this + command runs in an endless loop, send SIGQUIT to the + ping process from another terminal and see what happens.
  • + +
  • Read kill(2) to learn what kill -9 -1 + does. Run this command if you are brave.
  • + +
  • Why doesn't the cd script work as + expected?
  • + +
  • Explain the difference between the two commands X=foo + bar and X=foo; bar.
  • + +
  • Run set and examine the environment variables of + an interactive shell session.
  • + +
  • Check this email + from Linus Torvalds about why stdio is not that simple at all.
  • + +
  • Run the command ls / /does-not-exist, redirect + stdout and stderr to different files.
  • + +
  • Consider the following shell code which uses stdio to first write + to stdout, then to stderr. echo foo; echo bar 1>&2. Which + circumstances guarantee that the "foo" line appears before the "bar" + line in the output?
  • + +
  • In the pipeline foo | bar, what is the + buffering type of the file descriptor which corresponds to + the stdin stream of the bar + process?
  • + +
  • Assume foo is a log file which increases due to + some process appending data to it. Explain why the command + tail -f foo | while read; do echo new data; done does not + work as expected. Fix the flaw by changing the buffering type with + stdbuf(1).
  • + +
  • Run sleep 100 > /dev/null &, examine open + files of the sleep process by looking at suitable files in + /proc. Do the same with sleep 100 | head + &.
  • + +
  • Run ldd /bin/sh and explain what happens when a + shell is executed.
  • + +
  • On a Linux system, run cat /proc/$$/maps or + pmap -x $$ to see the address space layout of your + shell. Check Documentation/filesystems/proc.txt + in the linux kernel source tree for the format of + /proc/$$/maps.
  • + +
  • Run cat /proc/$$/smaps and examine the values of + the heap section.
  • + +
  • Assume some program allocates a lot of memory so that the size of + the valid virtual addresses is 1T large. Assume further that a software + bug causes the content of a pointer variable to be overwritten with + random garbage. Determine the probability that this pointer variable + contains a valid address (assuming a 64 bit system).
  • +
+ +HOMEWORK(« + +Explain how PR_SET_CHILD_SUBREAPER works and possible +use-cases for this (Linux) feature. + +») + +HOMEWORK(« + +Explain in one paragraph of text the purpose of the file +creation mask (also known as umask) of a process. + +») + +HOMEWORK(« + +When we said that each process runs on behalf of a user and that the +ID of this user is part of the process metadata, we were simplifying +matters. There are actually three different UIDs and three different +GIDs: the real UID, the effective UID, and the +saved set-user ID, and analogous for the group IDs. Explain +the purpose of the three UIDs. + +») + + +HOMEWORK(« + +On a multi-CPU system the performance of a program can be +enhanced by allowing for multiple flows of control. This is the +idea behind threads, which are also called lightweight +processes. Give an overview of threads, summarize the POSIX thread +API (see pthreads(7)) and explain how the Linux-specific +clone(2) system call can used to implement threads. + +») + +HOMEWORK(« + +Explain what the command find /etc > /dev/null does, +and why you get some error messages. Assume you'd like to extract +only those error messages which contain the string "lvm". Explain +why find /etc > /dev/null | grep lvm does not work as +expected. Come up with a similiar command that works. + +», « + +The command traverses the /etc directory recursively and +prints all files and directories it encounters during the traversal to +stdout. Since stdout is redirected to the NULL device by the > +/dev/null construct, only the stderr stream containing the error +messages makes it to the terminal. This includes all subdirectories +of /etc which cannot be traversed due to insufficient +permissions (no "r" bit set). The proposed find | grep +command does not work since the | operator is evaluated +before any redirections specified by the find command +take place. More precisely, stdout of the find process is redirected +twice: First to one end of the pipe due to the |, +then to the NULL device due to the > /dev/null. The +last redirection "wins", so the grep process does not +see any input. The command find /etc 2>&1 > /dev/null | grep +lvm works. The following four redirections take place: First +stdout of the find process and stdin of grep +process are redirected to the two ends of the pipe. Next, due to +the 2>&1 the stderr stream of the find +process is redirected to the current destination of stdout, i.e., +to the pipe. Finally the > /dev/null redirects stdout +of the find process to the NULL device. Hence error messages go to +the pipe and are processed by grep as desired. + +») + +HOMEWORK(« +Run ulimit -n to see the maximal number of file descriptors you +are allowed to create. Explain what this limit means with respect +to multiple processes, multiple logins, and the fork(2) system +call. Write a program in your language of choice which creates file +descriptors in a loop until it fails due to the file descriptor +limit. Then print the number of file descriptors the program was able +to create. +», « +On our systems the limit is set to 1024. This means a single process +can only have this many files open at any given time. Independent +processes (like those coming from different login sessions) have no +common file descriptors, even though they may open the same files. In +this sense the file descriptor limit is a per-process limit. However, +when a process calls «fork(») to create a new process, the new +process inherits all open file descriptors from the parent. 
This can +lead to the situation where a newly created process is unable to open +any files. This property was actually used to break computer +security. The «O_CLOEXEC» flag was introduced not too long +ago to deal with this problem. See open(2) for details. + +C program that opens the maximal possible number of file descriptors: + +
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	int main(void)
+	{
+		int i;
+
+		for (i = 0; open("/dev/null", O_RDONLY) >= 0; i++)
+			;
+		printf("opened %d file descriptors\n", i);
+		exit(0);
+	}
+
+») + +HOMEWORK(« + +Search the web for the document called +vm/overcommit-accounting. Discuss the pros and cons of +the three possible overcommit handling modes. + +») + +HOMEWORK(« + +Read this +blog +posting on the virtual memory overcommit issue. Explain the +catch-22 situation described there in no more than two sentences. + +») + +HOMEWORK(« + +Describe, in a single paragraph of text, what a virtual dynamic +shared object (VDSO) is and which type of applications benefit most +from it. + +») + +HOMEWORK(« + +Describe the concept of huge pages and the Linux-specific +implementation of transparent huge pages. Discuss the pros +and cons of huge tables and explain the workloads which would benefit +from a setup with huge pages enabled. + +») + +HOMEWORK(« +
    +
  • Explain the concept of address space layout randomization + (ASLR).
  • + +
  • Run bash -c 'cat /proc/$$/maps' + repeatedly to see address space layout randomization in action. Discuss + the pros and cons of ASLR.
  • +
+») + +SUPPLEMENTS() + +SUBSECTION(«cd_script») + +
+	#!/bin/sh
+	echo "changing CWD to $1"
+	cd "$1"
+
+ +SUBSECTION(«hello_world») + +
+	#!/bin/sh
+	echo "hello world"
+
+ +SUBSECTION(«symlink_madness») + +
+	#!/bin/sh
+	mkdir foo
+	touch foo/a
+	ln -s ../foo foo/testdir
+	ls -l foo/a foo/testdir/a foo/testdir/testdir/a
+
+ +SECTION(«Further Reading») + diff --git a/include/css/aple.css b/include/css/aple.css new file mode 100644 index 0000000..6ebecac --- /dev/null +++ b/include/css/aple.css @@ -0,0 +1,96 @@ +body { + background-color: #aacccc; + text-align: justify; + padding: 0px 30px 0px 25px; + +} + +h1 { + font-size: 150%; +} + +.logo { + border: 0px; + padding: 0em 20px 0px 0px; + margin-left: 0px; + margin-right: 0px; + border: 0px; +} + +table { + margin-left: auto; + margin-right: auto; + border: none; +} + +td { + border: 2px #9bb solid; + padding: 5px; +} + +svg { + float: left; +} + +#menu +{ + position: fixed; + left: -40%; + width: 40%; + top: 0%; + height: 100%; + background: #ccc; + margin: 0px 0px 0px 15px; + padding: 200% 0px 0px 0px; + transition: 0.5s; +} + +#menu:hover { + top: 0em; + left: 0em; + margin: 0em 0em 0em 0em; + padding: 0em 15px 0em 15px; + background: #aaa; + overflow: auto; +} + +#title +{ + vertical-align: middle; + border: 0px; +} + +#overview_heading { + text-align: center; + font-weight: bold; +} + +#overview_text { + margin: 1% 5% 1% 5%; + font-size: 95%; +} + +div.solution { + margin: 0% 3% 0% 3%; + font-size: 95%; +} +div.diffctx { + font-family: monospace; + color: #000; +} +div.diffadd { + font-family: monospace; + color: #085; +} +div.diffdel { + font-family: monospace; + color: #a08; +} + +dt { + text-decoration: underline; +} + +#chapter_list { + font-size: 120%; +} diff --git a/include/imgs/aple.svg b/include/imgs/aple.svg new file mode 100644 index 0000000..989f88d --- /dev/null +++ b/include/imgs/aple.svg @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/include/m4/aple.m4 b/include/m4/aple.m4 new file mode 100644 index 0000000..2fc7af9 --- /dev/null +++ b/include/m4/aple.m4 @@ -0,0 +1,107 @@ +changequote(`«', `»')dnl +dnl Discard output by diverting to a negative diversion to suppress +dnl unwanted trailing newlines after macro definitions. +divert(«-1») + +define(«LOCAL_LINK_NAME», «translit(«$1», «A-Z + +», «a-z__»)») +define(«REMOVE_NEWLINE», «translit(«$1»,« +», « »)») + +define(«REFERENCE», [«REMOVE_NEWLINE(«$2»)»](««#»»«LOCAL_LINK_NAME(«$1»)»)) +define(«XREFERENCE», «[$2]»«($1)») +define(«EMPH», ««_»REMOVE_NEWLINE(«$1»)«_»») +define(«CMD», «`REMOVE_NEWLINE(«$1»)`») +define(«SECTION», « +divert(«0») +
  • $1
  • +divert(«2») + +») +define(«SUBSECTION», « +») + +define(«OVERVIEW», « + +
    +ifelse(«$2», «», «Overview», «$2») +
    +
    +$1 +
    +») +define(«EXERCISES», «

    Exercises

    ») + +define(«HOMEWORK», « +

    Homework

    +ifelse(«$2», «», «$1», «dnl +
    +$1 +

    Solution

    +
    $2
    ») +») +define(«SUPPLEMENTS», «SECTION(«Supplements»)») + +define(«TOC_ENTRY», « + ifelse(«$1.m4», «$2», «», «») + translit(«$1», «_», « ») + ifelse(«$1.m4», «$2», «», «») +
    +») +define(«LOGO», «logo») + +define(«HEADER», « + + + + + Unix course + + + + + + + + + +») + +define(«TITLE», « +HEADER(«$1», «$2», «$3») +
    + +divert(«2») +») +divert(«0»)dnl -- 2.30.2