dplyrgetting familiar with transforming data with dplyr
tidyverse package and explore the starwars dataset.library(tidyverse)
glimpse(starwars)
## Observations: 87
## Variables: 13
## $ name <chr> "Luke Skywalker", "C-3PO", "R2-D2", "Darth Vader", "L…
## $ height <int> 172, 167, 96, 202, 150, 178, 165, 97, 183, 182, 188, …
## $ mass <dbl> 77.0, 75.0, 32.0, 136.0, 49.0, 120.0, 75.0, 32.0, 84.…
## $ hair_color <chr> "blond", NA, NA, "none", "brown", "brown, grey", "bro…
## $ skin_color <chr> "fair", "gold", "white, blue", "white", "light", "lig…
## $ eye_color <chr> "blue", "yellow", "red", "yellow", "brown", "blue", "…
## $ birth_year <dbl> 19.0, 112.0, 33.0, 41.9, 19.0, 52.0, 47.0, NA, 24.0, …
## $ gender <chr> "male", NA, NA, "male", "female", "male", "female", N…
## $ homeworld <chr> "Tatooine", "Tatooine", "Naboo", "Tatooine", "Alderaa…
## $ species <chr> "Human", "Droid", "Droid", "Human", "Human", "Human",…
## $ films <list> [<"Revenge of the Sith", "Return of the Jedi", "The …
## $ vehicles <list> [<"Snowspeeder", "Imperial Speeder Bike">, <>, <>, <…
## $ starships <list> [<"X-wing", "Imperial shuttle">, <>, <>, "TIE Advanc…
homeworldcount(starwars, homeworld)
## # A tibble: 49 x 2
## homeworld n
## <chr> <int>
## 1 Alderaan 3
## 2 Aleen Minor 1
## 3 Bespin 1
## 4 Bestine IV 1
## 5 Cato Neimoidia 1
## 6 Cerea 1
## 7 Champala 1
## 8 Chandrila 1
## 9 Concord Dawn 1
## 10 Corellia 2
## # … with 39 more rows
#starwars %>%
# group_by(homeworld) %>%
# summarise(n = n())
count(starwars, homeworld) %>%
arrange(desc(n))
## # A tibble: 49 x 2
## homeworld n
## <chr> <int>
## 1 Naboo 11
## 2 Tatooine 10
## 3 <NA> 10
## 4 Alderaan 3
## 5 Coruscant 3
## 6 Kamino 3
## 7 Corellia 2
## 8 Kashyyyk 2
## 9 Mirial 2
## 10 Ryloth 2
## # … with 39 more rows
filter(starwars, is.na(homeworld))
## # A tibble: 10 x 13
## name height mass hair_color skin_color eye_color birth_year gender
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 Yoda 66 17 white green brown 896 male
## 2 IG-88 200 140 none metal red 15 none
## 3 Arve… NA NA brown fair brown NA male
## 4 Qui-… 193 89 brown fair blue 92 male
## 5 R4-P… 96 NA none silver, r… red, blue NA female
## 6 Finn NA NA black dark dark NA male
## 7 Rey NA NA brown light hazel NA female
## 8 Poe … NA NA brown light brown NA male
## 9 BB8 NA NA none none black NA none
## 10 Capt… NA NA unknown unknown unknown NA female
## # … with 5 more variables: homeworld <chr>, species <chr>, films <list>,
## # vehicles <list>, starships <list>
homeworld and speciescount(starwars, homeworld, species) %>%
arrange(desc(n))
## # A tibble: 58 x 3
## homeworld species n
## <chr> <chr> <int>
## 1 Tatooine Human 8
## 2 Naboo Human 5
## 3 <NA> Human 5
## 4 Alderaan Human 3
## 5 Naboo Gungan 3
## 6 Corellia Human 2
## 7 Coruscant Human 2
## 8 Kamino Kaminoan 2
## 9 Kashyyyk Wookiee 2
## 10 Mirial Mirialan 2
## # … with 48 more rows
specieshomeworldcount(starwars, homeworld, species) %>%
arrange(desc(n)) %>%
filter(n > 1, !is.na(species), !is.na(homeworld))
## # A tibble: 11 x 3
## homeworld species n
## <chr> <chr> <int>
## 1 Tatooine Human 8
## 2 Naboo Human 5
## 3 Alderaan Human 3
## 4 Naboo Gungan 3
## 5 Corellia Human 2
## 6 Coruscant Human 2
## 7 Kamino Kaminoan 2
## 8 Kashyyyk Wookiee 2
## 9 Mirial Mirialan 2
## 10 Ryloth Twi'lek 2
## 11 Tatooine Droid 2
homeworld, then 7 columns of speciescount(starwars, homeworld, species) %>%
arrange(desc(n)) %>%
filter(n > 1, !is.na(species), !is.na(homeworld)) %>%
pivot_wider(id_cols = homeworld,
names_from = species,
values_from = n)
## # A tibble: 9 x 8
## homeworld Human Gungan Kaminoan Wookiee Mirialan `Twi'lek` Droid
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 Tatooine 8 NA NA NA NA NA 2
## 2 Naboo 5 3 NA NA NA NA NA
## 3 Alderaan 3 NA NA NA NA NA NA
## 4 Corellia 2 NA NA NA NA NA NA
## 5 Coruscant 2 NA NA NA NA NA NA
## 6 Kamino NA NA 2 NA NA NA NA
## 7 Kashyyyk NA NA NA 2 NA NA NA
## 8 Mirial NA NA NA NA 2 NA NA
## 9 Ryloth NA NA NA NA NA 2 NA
NA values by 0.You can find at least 3 solutions:
pivot_wider functiontidyr helper on the long format datadplyr function once in wide formatcount(starwars, homeworld, species) %>%
arrange(desc(n)) %>%
filter(n > 1, !is.na(species), !is.na(homeworld)) %>%
pivot_wider(id_cols = homeworld,
names_from = species,
values_from = n,
values_fill = list(n = 0))
## # A tibble: 9 x 8
## homeworld Human Gungan Kaminoan Wookiee Mirialan `Twi'lek` Droid
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 Tatooine 8 0 0 0 0 0 2
## 2 Naboo 5 3 0 0 0 0 0
## 3 Alderaan 3 0 0 0 0 0 0
## 4 Corellia 2 0 0 0 0 0 0
## 5 Coruscant 2 0 0 0 0 0 0
## 6 Kamino 0 0 2 0 0 0 0
## 7 Kashyyyk 0 0 0 2 0 0 0
## 8 Mirial 0 0 0 0 2 0 0
## 9 Ryloth 0 0 0 0 0 2 0
count(starwars, homeworld, species) %>%
arrange(desc(n)) %>%
filter(n > 1, !is.na(species), !is.na(homeworld)) %>%
# using tidyr::complete()
complete(homeworld, species, fill = list(n = 0)) %>%
pivot_wider(id_cols = homeworld,
names_from = species,
values_from = n)
## # A tibble: 9 x 8
## homeworld Droid Gungan Human Kaminoan Mirialan `Twi'lek` Wookiee
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Alderaan 0 0 3 0 0 0 0
## 2 Corellia 0 0 2 0 0 0 0
## 3 Coruscant 0 0 2 0 0 0 0
## 4 Kamino 0 0 0 2 0 0 0
## 5 Kashyyyk 0 0 0 0 0 0 2
## 6 Mirial 0 0 0 0 2 0 0
## 7 Naboo 0 3 5 0 0 0 0
## 8 Ryloth 0 0 0 0 0 2 0
## 9 Tatooine 2 0 8 0 0 0 0
count(starwars, homeworld, species) %>%
arrange(desc(n)) %>%
filter(n > 1, !is.na(species), !is.na(homeworld)) %>%
pivot_wider(id_cols = homeworld,
names_from = species,
values_from = n) %>%
# using dplyr::mutate_if.
# for only columns of integers, apply if_else statement
# - is na -> change for 0
# - is not na -> get same value
#mutate_if(is.integer, list(~ if_else(is.na(.), 0L, .)))
mutate_at(vars(-homeworld), list(~ if_else(is.na(.), 0L, .)))
## # A tibble: 9 x 8
## homeworld Human Gungan Kaminoan Wookiee Mirialan `Twi'lek` Droid
## <chr> <int> <int> <int> <int> <int> <int> <int>
## 1 Tatooine 8 0 0 0 0 0 2
## 2 Naboo 5 3 0 0 0 0 0
## 3 Alderaan 3 0 0 0 0 0 0
## 4 Corellia 2 0 0 0 0 0 0
## 5 Coruscant 2 0 0 0 0 0 0
## 6 Kamino 0 0 2 0 0 0 0
## 7 Kashyyyk 0 0 0 2 0 0 0
## 8 Mirial 0 0 0 0 2 0 0
## 9 Ryloth 0 0 0 0 0 2 0
height and mass per species. Add a column n to know from how many individuals the values were computed. Filter out n values smaller than 2.starwars %>%
filter(!is.na(height), !is.na(mass)) %>%
group_by(species) %>%
mutate(n = n()) %>%
filter(n >= 2) %>%
summarise(height_mean = mean(height),
height_med = median(height),
mass_mean = mean(mass),
mass_med = median(mass),
n = n())
## # A tibble: 5 x 6
## species height_mean height_med mass_mean mass_med n
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 Droid 140 132 69.8 53.5 4
## 2 Gungan 210 210 74 74 2
## 3 Human 180. 181 82.8 79 22
## 4 Mirialan 168 168 53.1 53.1 2
## 5 Wookiee 231 231 124 124 2
height.mass.sd / sqrt(n)) for heightmassdiff_med_meanstarwars %>%
filter(!is.na(height), !is.na(mass)) %>%
group_by(species) %>%
mutate(n = n()) %>%
filter(n >= 2) %>%
summarise(height_mean = mean(height),
height_med = median(height),
mass_mean = mean(mass),
mass_med = median(mass),
height_sd = sd(height),
mass_sd = sd(mass),
n = n()) %>%
mutate(diff_height = height_mean - height_med,
diff_mass = mass_mean - mass_med,
sem_height = height_sd / sqrt(n),
sem_mass = mass_sd / sqrt(n)) %>%
select(species, starts_with("diff"), starts_with("sem")) -> diff_med_mean
height or mass.filter(diff_med_mean, abs(diff_height) > sem_height | abs(diff_mass) > sem_mass)
## # A tibble: 0 x 5
## # … with 5 variables: species <chr>, diff_height <dbl>, diff_mass <dbl>,
## # sem_height <dbl>, sem_mass <dbl>
biomaRtIf you are missing biomaRt, install it with:
# install if missing # install.packages("BiocManager") BiocManager::install("biomaRt")
biomaRt.library(biomaRt)
gene_mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
host = "www.ensembl.org")
gene_set <- useDataset(gene_mart , dataset = "hsapiens_gene_ensembl")
gene_by_exon <- as_tibble(getBM(
mart = gene_set,
attributes = c(
"ensembl_gene_id",
"ensembl_transcript_id",
"ensembl_exon_id",
"chromosome_name",
"start_position",
"end_position",
"hgnc_symbol",
"hgnc_id",
"strand",
"gene_biotype",
"phenotype_description"
),
filter = "chromosome_name",
value = "21"
))
#write_csv(gene_by_exon, here::here("data", "gene_by_exon.csv"))
genes_by_exon data set.genes_by_exon data set to a tibble.glimpse() to find tin which column this info is storeddistinct() on this column to identify how pseudogenes are coded.pseudogenes.stringr, see the function str_detect() that look for the presence of a substring and return a logical vector. In combination with filter() you should be able to extract all “pseudogene” genes
gene_by_exon <- as_tibble(gene_by_exon)
pseudogenes <- filter(gene_by_exon, str_detect(gene_biotype, "pseudogene"))
table())count(pseudogenes, gene_biotype)
## # A tibble: 6 x 2
## gene_biotype n
## <chr> <int>
## 1 processed_pseudogene 167
## 2 pseudogene 2
## 3 rRNA_pseudogene 5
## 4 transcribed_processed_pseudogene 18
## 5 transcribed_unprocessed_pseudogene 194
## 6 unprocessed_pseudogene 142
count(pseudogenes, ensembl_gene_id)
## # A tibble: 188 x 2
## ensembl_gene_id n
## <chr> <int>
## 1 ENSG00000168122 3
## 2 ENSG00000173231 1
## 3 ENSG00000175302 18
## 4 ENSG00000176054 1
## 5 ENSG00000179381 1
## 6 ENSG00000183249 4
## 7 ENSG00000185390 2
## 8 ENSG00000187172 28
## 9 ENSG00000188681 10
## 10 ENSG00000189089 1
## # … with 178 more rows
# 188, even pseudogenes have multiple transcripts
gene_by_exon %>%
mutate(length = end_position - start_position) %>%
arrange(length)
## # A tibble: 21,518 x 12
## ensembl_gene_id ensembl_transcr… ensembl_exon_id chromosome_name
## <chr> <chr> <chr> <dbl>
## 1 ENSG00000266692 ENST00000581669 ENSE00002724833 21
## 2 ENSG00000275523 ENST00000611656 ENSE00003745719 21
## 3 ENSG00000277437 ENST00000614492 ENSE00003754026 21
## 4 ENSG00000264063 ENST00000577708 ENSE00002724720 21
## 5 ENSG00000275167 ENST00000611994 ENSE00003717901 21
## 6 ENSG00000283904 ENST00000385060 ENSE00001500066 21
## 7 ENSG00000284448 ENST00000290239 ENSE00003729123 21
## 8 ENSG00000275166 ENST00000622292 ENSE00003738277 21
## 9 ENSG00000201025 ENST00000364155 ENSE00001438918 21
## 10 ENSG00000263681 ENST00000582241 ENSE00002698395 21
## # … with 21,508 more rows, and 8 more variables: start_position <dbl>,
## # end_position <dbl>, hgnc_symbol <chr>, hgnc_id <chr>, strand <dbl>,
## # gene_biotype <chr>, phenotype_description <chr>, length <dbl>
gene_biotype.gene_by_exon %>%
mutate(length = end_position - start_position) %>%
group_by(gene_biotype) %>%
summarise(mean_length = mean(length))
## # A tibble: 21 x 2
## gene_biotype mean_length
## <chr> <dbl>
## 1 3prime_overlapping_ncRNA 3643
## 2 antisense 34627.
## 3 bidirectional_promoter_lncRNA 9756
## 4 IG_V_gene 435
## 5 lincRNA 109333.
## 6 miRNA 86.6
## 7 misc_RNA 145.
## 8 processed_pseudogene 1007.
## 9 processed_transcript 73037.
## 10 protein_coding 141824.
## # … with 11 more rows
gene_biotype.gene_by_exon %>%
mutate(length = end_position - start_position) %>%
group_by(gene_biotype) %>%
summarize(mean_length = mean(length),
n_gene = n_distinct(ensembl_gene_id),
n_trans = n_distinct(ensembl_transcript_id),
n_exon = n_distinct(ensembl_exon_id),
n = n())
## # A tibble: 21 x 6
## gene_biotype mean_length n_gene n_trans n_exon n
## <chr> <dbl> <int> <int> <int> <int>
## 1 3prime_overlapping_ncRNA 3643 1 1 4 4
## 2 antisense 34627. 79 154 347 399
## 3 bidirectional_promoter_lncRNA 9756 1 4 10 12
## 4 IG_V_gene 435 1 1 2 2
## 5 lincRNA 109333. 192 380 1003 1258
## 6 miRNA 86.6 29 29 29 29
## 7 misc_RNA 145. 24 24 24 24
## 8 processed_pseudogene 1007. 138 138 167 167
## 9 processed_transcript 73037. 7 38 121 141
## 10 protein_coding 141824. 231 1530 5693 18986
## # … with 11 more rows
gene_by_exon %>%
filter(gene_biotype == "bidirectional_promoter_lncRNA") %>%
arrange(ensembl_exon_id)
## # A tibble: 12 x 11
## ensembl_gene_id ensembl_transcr… ensembl_exon_id chromosome_name
## <chr> <chr> <chr> <dbl>
## 1 ENSG00000223768 ENST00000647108 ENSE00001542583 21
## 2 ENSG00000223768 ENST00000454115 ENSE00001542586 21
## 3 ENSG00000223768 ENST00000647108 ENSE00001655745 21
## 4 ENSG00000223768 ENST00000454115 ENSE00001668643 21
## 5 ENSG00000223768 ENST00000647108 ENSE00001697127 21
## 6 ENSG00000223768 ENST00000400362 ENSE00001714446 21
## 7 ENSG00000223768 ENST00000454115 ENSE00001714446 21
## 8 ENSG00000223768 ENST00000647108 ENSE00001714446 21
## 9 ENSG00000223768 ENST00000400362 ENSE00001747474 21
## 10 ENSG00000223768 ENST00000400362 ENSE00003821463 21
## 11 ENSG00000223768 ENST00000433465 ENSE00003823847 21
## 12 ENSG00000223768 ENST00000433465 ENSE00003829032 21
## # … with 7 more variables: start_position <dbl>, end_position <dbl>,
## # hgnc_symbol <chr>, hgnc_id <chr>, strand <dbl>, gene_biotype <chr>,
## # phenotype_description <chr>
separate_rows(), and count in a single pipe workflowtolower()
gene_by_exon %>%
#distinct(phenotype_description) %>%
select(phenotype_description) %>%
filter(!is.na(phenotype_description)) %>%
separate_rows(phenotype_description, sep = " ") %>%
mutate(phenotype_description = tolower(phenotype_description)) %>%
count(phenotype_description) %>%
arrange(desc(n))
## # A tibble: 229 x 2
## phenotype_description n
## <chr> <int>
## 1 type 2169
## 2 syndrome 1831
## 3 autosomal 1159
## 4 to 1119
## 5 amyloidosis 1044
## 6 deficiency 1037
## 7 due 772
## 8 1 755
## 9 early-onset 735
## 10 abeta 696
## # … with 219 more rows
# type with 2169 occurences
tidyrchol are mapped as values in columns visit. Note that for 1L L only specifies integers.chol_by_visit <- tribble(
~sampleid, ~visit, ~chol,
"S1", 1L, 120.0,
"S1", 2L, 178,
"S2", 1L, 180,
"S2", 2L, 221,
"S2", 3L, 240,
"S3", 1L, 122,
"S3", 2L, 160,
"S3", 3L, 154
)
The result should look like this:
| sampleid | 1 | 2 | 3 |
|---|---|---|---|
| S1 | 120 | 178 | NA |
pivot_wider(chol_by_visit,
id_cols = sampleid,
names_from = visit,
values_from = chol)
## # A tibble: 3 x 4
## sampleid `1` `2` `3`
## <chr> <dbl> <dbl> <dbl>
## 1 S1 120 178 NA
## 2 S2 180 221 240
## 3 S3 122 160 154
pivot_wider had revealed all combinations between visit and sampleid. And from the raw data we see that patient S1 did not attend visit 3.
variants <- tribble(
~sampleid, ~var1, ~var2, ~var3,
"S1", "A3T", "T5G", "T6G",
"S2", "A3G", "T5G", NA,
"S3", "A3T", "T6C", "G10C",
"S4", "A3T", "T6C", "G10C"
)
biomaRt also has a function called select(). If it was loaded after dplyr then use dplyr::select() syntax to specify the appropriate one
the var1, 2 or 3 are build the same way:
you can once in the long format, split each of the 3 informations into its own column using separate(x, y, sep = 1:3) where x is the column of mutations (3 characters merged) and y a character vector containing the 3 column names of your choice. Like c("ref", "pos", "derived").
# not tidy, var1, 2, 3 should be one column
variants %>%
pivot_longer(cols = -sampleid,
names_to = "varcol",
values_to = "mutation") %>%
separate(mutation, into = c("ref", "pos", "derived"), sep = 1:3) %>%
dplyr::select(sampleid, ref, pos, derived) %>%
filter(!is.na(derived)) %>%
pivot_wider(names_from = c(ref, pos),
values_from = derived)
## # A tibble: 4 x 5
## sampleid A_3 T_5 T_6 G_1
## <chr> <chr> <chr> <chr> <chr>
## 1 S1 T G G <NA>
## 2 S2 G G <NA> <NA>
## 3 S3 T <NA> C 0
## 4 S4 T <NA> C 0
# using regular expressions with stringr::str_extract
variants %>%
pivot_longer(cols = -sampleid,
names_to = "varcol",
values_to = "mutation") %>%
# \\d = [0-9] and a '+' otherwise on 10 only 1 is extracted. + means up to the non number character
mutate(pos = str_extract(mutation, "\\d+"), #
# hat ('^') means begining of the line and followed by either A, C, T or G and only one of each
ref = str_extract(mutation, "^[ACTG]"),
# dollar ('$') means end of the line and precedesd by either A, C, T or G and only one of each
der = str_extract(mutation, "[ACTG]$"))
## # A tibble: 12 x 6
## sampleid varcol mutation pos ref der
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 S1 var1 A3T 3 A T
## 2 S1 var2 T5G 5 T G
## 3 S1 var3 T6G 6 T G
## 4 S2 var1 A3G 3 A G
## 5 S2 var2 T5G 5 T G
## 6 S2 var3 <NA> <NA> <NA> <NA>
## 7 S3 var1 A3T 3 A T
## 8 S3 var2 T6C 6 T C
## 9 S3 var3 G10C 10 G C
## 10 S4 var1 A3T 3 A T
## 11 S4 var2 T6C 6 T C
## 12 S4 var3 G10C 10 G C
variant_significance <- tribble(
~variant, ~significance,
"A3T", "U",
"A3G", "D",
"T5G", "B",
"T6G", "D",
"T6C", "B",
"G10C", "U"
)
variants %>%
pivot_longer(cols = -sampleid,
names_to = "varcol",
values_to = "mutation") %>%
inner_join(variant_significance, by = c(mutation = "variant")) %>%
filter(significance == "D")
## # A tibble: 2 x 4
## sampleid varcol mutation significance
## <chr> <chr> <chr> <chr>
## 1 S1 var3 T6G D
## 2 S2 var1 A3G D
variants %>%
pivot_longer(cols = -sampleid,
names_to = "varcol",
values_to = "mutation") %>%
semi_join(filter(variant_significance, significance == "D"),
by = c(mutation = "variant"))
## # A tibble: 2 x 3
## sampleid varcol mutation
## <chr> <chr> <chr>
## 1 S1 var3 T6G
## 2 S2 var1 A3G