Lösung zur Hausaufgabe RNA-Seq

Wir laden zunächst die Daten und werfen einen Blick in die linke obere Ecke der Tabelle:

library( tidyverse ) 
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
read_tsv( "data/evodevo_mouse_counts.tsv" ) -> mouse_counts
Rows: 36141 Columns: 317
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr   (1): gene_id
dbl (316): Brain.e10.5.1, Brain.e10.5.2, Brain.e10.5.3, Brain.e11.5.1, Brain...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Linke obere Ecke der Matrix:
mouse_counts[1:5, 1:5]
# A tibble: 5 × 5
  gene_id            Brain.e10.5.1 Brain.e10.5.2 Brain.e10.5.3 Brain.e11.5.1
  <chr>                      <dbl>         <dbl>         <dbl>         <dbl>
1 ENSMUSG00000000001          9532          7451          7631          8833
2 ENSMUSG00000000003             0             0             0             0
3 ENSMUSG00000000028          2374          2299          2112          1724
4 ENSMUSG00000000037           322           282           263           328
5 ENSMUSG00000000049             0             0             0             0

Nun verwenden wir den Code aus der Angabe, um die Probentabelle zu erzeugen:

tibble( sample = colnames(mouse_counts)[-1] ) %>%
separate( sample, c( "organ", "timepoint", "replicate" ), sep="\\.(?!5\\.)", remove=FALSE ) %>%
mutate( timepoint = fct_inorder(timepoint) ) -> sample_table

sample_table
# A tibble: 316 × 4
   sample        organ timepoint replicate
   <chr>         <chr> <fct>     <chr>    
 1 Brain.e10.5.1 Brain e10.5     1        
 2 Brain.e10.5.2 Brain e10.5     2        
 3 Brain.e10.5.3 Brain e10.5     3        
 4 Brain.e11.5.1 Brain e11.5     1        
 5 Brain.e11.5.2 Brain e11.5     2        
 6 Brain.e11.5.3 Brain e11.5     3        
 7 Brain.e11.5.4 Brain e11.5     4        
 8 Brain.e12.5.1 Brain e12.5     1        
 9 Brain.e12.5.2 Brain e12.5     2        
10 Brain.e12.5.3 Brain e12.5     3        
# ℹ 306 more rows

Schritt 1: Pivotieren

mouse_counts %>%
pivot_longer( -gene_id, names_to = "sample", values_to = "count" ) %>%
head()   # head: Zeige nur die ersten 10 Zeilen
# A tibble: 6 × 3
  gene_id            sample        count
  <chr>              <chr>         <dbl>
1 ENSMUSG00000000001 Brain.e10.5.1  9532
2 ENSMUSG00000000001 Brain.e10.5.2  7451
3 ENSMUSG00000000001 Brain.e10.5.3  7631
4 ENSMUSG00000000001 Brain.e11.5.1  8833
5 ENSMUSG00000000001 Brain.e11.5.2 11079
6 ENSMUSG00000000001 Brain.e11.5.3 11377

Schritt 2: Normalisieren

mouse_counts %>%
pivot_longer( -gene_id, names_to = "sample", values_to = "count" ) %>%
group_by( sample ) %>% 
mutate( cpm = count / sum(count) * 1e6 ) -> long_tbl

head( long_tbl)
# A tibble: 6 × 4
# Groups:   sample [6]
  gene_id            sample        count   cpm
  <chr>              <chr>         <dbl> <dbl>
1 ENSMUSG00000000001 Brain.e10.5.1  9532  332.
2 ENSMUSG00000000001 Brain.e10.5.2  7451  263.
3 ENSMUSG00000000001 Brain.e10.5.3  7631  272.
4 ENSMUSG00000000001 Brain.e11.5.1  8833  341.
5 ENSMUSG00000000001 Brain.e11.5.2 11079  376.
6 ENSMUSG00000000001 Brain.e11.5.3 11377  372.

Schritt 3: Table-Joins

long_tbl %>%
left_join( sample_table ) %>%
left_join( read_tsv( "data/Ensembl_102_GRCm38.p6_Gene_names_2.tsv" ), by="gene_id" ) -> long_tbl

head( long_tbl )
# A tibble: 6 × 9
# Groups:   sample [6]
  gene_id            sample      count   cpm organ timepoint replicate gene_name
  <chr>              <chr>       <dbl> <dbl> <chr> <fct>     <chr>     <chr>    
1 ENSMUSG00000000001 Brain.e10.…  9532  332. Brain e10.5     1         Gnai3    
2 ENSMUSG00000000001 Brain.e10.…  7451  263. Brain e10.5     2         Gnai3    
3 ENSMUSG00000000001 Brain.e10.…  7631  272. Brain e10.5     3         Gnai3    
4 ENSMUSG00000000001 Brain.e11.…  8833  341. Brain e11.5     1         Gnai3    
5 ENSMUSG00000000001 Brain.e11.… 11079  376. Brain e11.5     2         Gnai3    
6 ENSMUSG00000000001 Brain.e11.… 11377  372. Brain e11.5     3         Gnai3    
# ℹ 1 more variable: gene_description <chr>

Schritt 4: Plotten

Hier ist der Code für Gen “Alb”:

gene <- "Alb"

long_tbl %>%
filter( gene_name == gene ) %>%
ggplot() +
  geom_point( aes( x=timepoint, y=cpm ), size=.3 ) +
  facet_wrap( ~ organ ) +
  scale_y_log10() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  ggtitle( gene )

Wir können den Code in eine Funktion packen und dann diese dann für alle Gene aufrufen:

plot_gene <- function(gene) {
  
  long_tbl %>%
  filter( gene_name == gene ) %>%
  ggplot() +
    geom_point( aes( x=timepoint, y=cpm ), size=.3 ) +
    facet_wrap( ~ organ ) +
    scale_y_log10() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
    ggtitle( gene )
}

plot_gene( "Hbb-bh3" )
Warning in scale_y_log10(): log-10 transformation introduced infinite values.

plot_gene( "Mbp" )

plot_gene( "Islr2" )
Warning in scale_y_log10(): log-10 transformation introduced infinite values.

plot_gene( "Gpt" )