Introduction to the AnVIL package

17 August 2023 · R

## Install BiocManager from CRAN only when it is not already available,
## then use it to install AnVIL and attach the package.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages(
        pkgs = "BiocManager",
        repos = "https://cran.r-project.org"
    )
}
BiocManager::install("AnVIL")
library(AnVIL)
## Look in the 'bin' folder below the path stored in the GCLOUD_SDK_PATH
## environment variable for files named exactly 'gcloud' or 'gsutil'.
dir(
    path = file.path(Sys.getenv("GCLOUD_SDK_PATH"), "bin"),
    pattern = "^(gcloud|gsutil)$"
)
## [1] "gcloud" "gsutil"
## the code chunks in this vignette are fully evaluated when
## gcloud_exists() returns TRUE
gcloud_exists()
## [1] FALSE
## Install a package through AnVIL's installer and add a project-specific
## library location.
AnVIL::install("GenomicFeatures")
add_libpaths("~/my/project")
gcloud_account() # authentication account
gcloud_project() # billing project information
## Run 'gcloud projects list', parse the tabular output, and keep the
## projects whose PROJECT_ID starts with "anvil".
gcloud_cmd("projects", "list") |>
    readr::read_table() |>
    filter(startsWith(PROJECT_ID, "anvil"))
gcloud_help("projects")
## Root of the public 1000 genomes bucket used by the examples below
src <- "gs://genomics-public-data/1000-genomes/"
gsutil_ls(src)

## Listing of the 'other' folder, requested recursively
other <- paste0(src, "other")
gsutil_ls(other, recursive = TRUE)

## Status information for a single object (the sample information CSV)
sample_info <- paste0(src, "other/sample_info/sample_info.csv")
gsutil_stat(sample_info)
## Copy the object to a local temporary file ...
fl <- tempfile()
gsutil_cp(sample_info, fl)

## ... and read the local copy; up to 5000 rows are used to guess column
## types, and an explicit (empty) column specification is supplied
csv <- readr::read_csv(fl, guess_max = 5000L, col_types = readr::cols())
csv
## Stream the CSV straight from the bucket object rather than from the local
## copy 'fl': the point of gsutil_pipe() is a connection to a google storage
## URI, so no intermediate file on disk is required. 'sample_info' is the
## gs:// path of the sample information CSV defined earlier in this script.
pipe <- gsutil_pipe(sample_info, "rb")
## Parse the streamed CSV and keep four descriptive columns
readr::read_csv(pipe, guess_max = 5000L, col_types = readr::cols()) %>%
    dplyr::select("Sample", "Family_ID", "Population", "Gender")
## Local directory that receives the synchronized files; stopifnot() halts
## the script if the directory cannot be created
destination <- tempfile()
stopifnot(dir.create(destination))
## NOTE(review): this variable has the same name as base::source()
source <- paste0(src, "other/sample_info")

## dry run
gsutil_rsync(source, destination)

## perform the synchronization, then list what is now present locally
gsutil_rsync(source, destination, dry = FALSE)
dir(destination, recursive = TRUE)

## nothing to synchronize
gsutil_rsync(source, destination, dry = FALSE)

## one file requires synchronization
unlink(file.path(destination, "README"))
gsutil_rsync(source, destination, dry = FALSE)
## Report the workspace namespace and name currently in use
avworkspace_namespace()
avworkspace_name()
## N.B.: IT MAY NOT BE NECESSARY TO SET THESE WHEN ON ANVIL
avworkspace_namespace("pathogen-genomic-surveillance")
avworkspace_name("COVID-19")
## Show the workspace tables, then retrieve and display the 'sample' table
avtables()
sample <- avtable("sample")
sample
## Keep the sample identifier plus every column whose name contains "fasta",
## restricted to rows with a non-missing final assembly
sample |>
    select("sample_id", contains("fasta")) |>
    filter(!is.na(final_assembly_fasta))
## Build a small table to import: mtcars as a tibble, with the row names
## moved to a 'model' column and spaces in the model names replaced by '_'
my_cars <-
    mtcars |>
    as_tibble(rownames = "model") |>
    mutate(model = gsub(" ", "_", model))
## Start the import, then query its status using the returned object
job_status <- avtable_import(my_cars)
avtable_import_status(job_status)
## Import again in pages of 10 rows; the echoed output below shows one job
## per page (4 pages for the 32 rows)
(job_status <- avtable_import(my_cars, pageSize = 10))
## pageSize = 10 rows (4 pages)
##   |======================================================================| 100%
## # A tibble: 4 × 5
##    page from_row to_row job_id                               status
##   <int>    <int>  <int> <chr>                                <chr>
## 1     1        1     10 a32e9706-f63c-49ed-9620-b214746b9392 Uploaded
## 2     2       11     20 f2910ac2-0954-4fb9-b36c-970845a266b7 Uploaded
## 3     3       21     30 e18adc5b-d26f-4a8a-a0d7-a232e17ac8d2 Uploaded
## 4     4       31     32 d14efb89-e2dd-4937-b80a-169520b5f563 Uploaded
## Re-check the jobs; the status object is updated by re-assigning the result
(job_status <- avtable_import_status(job_status))
## checking status of 4 avtable import jobs
##   |======================================================================| 100%
## # A tibble: 4 × 5
##    page from_row to_row job_id                               status
##   <int>    <int>  <int> <chr>                                <chr>
## 1     1        1     10 a32e9706-f63c-49ed-9620-b214746b9392 Done
## 2     2       11     20 f2910ac2-0954-4fb9-b36c-970845a266b7 Done
## 3     3       21     30 e18adc5b-d26f-4a8a-a0d7-a232e17ac8d2 ReadyForUpsert
## 4     4       31     32 d14efb89-e2dd-4937-b80a-169520b5f563 ReadyForUpsert
## Check once more; in the echoed output every page has reached 'Done'
(job_status <- avtable_import_status(job_status))
## checking status of 4 avtable import jobs
##   |======================================================================| 100%
## # A tibble: 4 × 5
##    page from_row to_row job_id                               status
##   <int>    <int>  <int> <chr>                                <chr>
## 1     1        1     10 a32e9706-f63c-49ed-9620-b214746b9392 Done
## 2     2       11     20 f2910ac2-0954-4fb9-b36c-970845a266b7 Done
## 3     3       21     30 e18adc5b-d26f-4a8a-a0d7-a232e17ac8d2 Done
## 4     4       31     32 d14efb89-e2dd-4937-b80a-169520b5f563 Done
## editable copy of '1000G-high-coverage-2019' workspace
avworkspace("bioconductor-rpci-anvil/1000G-high-coverage-2019")
## Start from the existing 'sample' table and add a 'set' column that assigns
## each row to an arbitrary group, drawn with replacement from the first six
## upper-case letters.
sample <- avtable("sample") %>%
    mutate(set = sample(head(LETTERS), size = nrow(.), replace = TRUE))
## new 'participant_set' table
avtable_import_set(sample, "participant", "set", "participant")
## new 'sample_set' table
avtable_import_set(sample, "sample", "set", "name")
## Workspace Data
avdata()
## Workspace bucket, and a listing of workspace files
bucket <- avbucket()
bucket
avfiles_ls()
## requires workspace ownership
uri <- avbucket()                             # discover bucket
## 'bucket' is re-assigned here to the full path of the object to be written
bucket <- file.path(uri, "mtcars.tab")
write.table(mtcars, gsutil_pipe(bucket, "w")) # write to bucket
## backup all files and folders in the current working directory
avfiles_backup(getwd(), recursive = TRUE)

## backup all files in the current directory
avfiles_backup(dir())

## backup all files to gs://<avbucket()>/scratch/
## (the source must be the character vector of file names returned by dir(),
## not the function object `dir` itself)
avfiles_backup(dir(), paste0(avbucket(), "/scratch"))
## Named character vector of DRS URIs; the names ('vcf', 'tbi') are used
## later to select individual entries
uri <- c(
    vcf = "drs://dg.ANV0/6f633518-f2de-4460-aaa4-a27ee6138ab5",
    tbi = "drs://dg.ANV0/4fb9e77f-c92a-4deb-ac90-db007dc633aa"
)
## Look up information about each URI; the echoed tibble has one row per URI
tbl <- drs_stat(uri)
## # A tibble: 2 × 9
##   drs      fileName   size gsUri accessUrl timeUpdated hashes       bucket name
##   <chr>    <chr>     <dbl> <chr> <chr>     <chr>       <list>       <chr>  <chr>
## 1 drs://d… NA21144… 7.06e9 gs:/… NA        2020-07-08… <named list> fc-56… CCDG…
## 2 drs://d… NA21144… 4.08e6 gs:/… NA        2020-07-08… <named list> fc-56… CCDG…
## Copy the referenced files to a local path or to the workspace bucket
## NOTE(review): "/tmp" is a hard-coded, platform-specific path
drs_cp(uri, "/tmp")     # local temporary directory
drs_cp(uri, avbucket()) # workspace bucket
## Attach VariantAnnotation without its start-up messages
suppressPackageStartupMessages(library(VariantAnnotation))
## Translate the DRS URIs to access URLs, and reference the VCF file together
## with its index through those URLs
https <- drs_access_url(uri)
vcffile <- VcfFile(https[["vcf"]], https[["tbi"]])
scanVcfHeader(vcffile)
## class: VCFHeader
## samples(1): NA21144
## meta(3): fileformat reference contig
## fixed(2): FILTER ALT
## info(16): BaseQRankSum ClippingRankSum ... ReadPosRankSum VariantType
## geno(11): GT AB ... PL SB

## Read only the variants in the first megabase of chr1 and count them
variants <- readVcf(vcffile, param = GRanges("chr1:1-1000000"))
nrow(variants)
## [1] 123077
## Construct the Terra service object and display it
terra <- Terra()
terra
## Show the endpoints tagged "Status"
tags(terra, "Status")
## Display the 'status' endpoint, then invoke it
terra$status
terra$status()
## Argument lists of two other endpoints
args(terra$createBillingProjectFull)
args(terra$overwriteWorkspaceMethodConfig)
## Keep a response, inspect it, and convert it to a list for further use
status <- terra$status()
class(status)
str(status)
lst <- as.list(status)
lengths(lst)
lengths(lst$systems)
str(lst$systems)
> .api_test_check(Terra(), "Terra") |> lengths()
        common          added        removed        updated  common_in_use
           135             24              3             11              9
removed_in_use updated_in_use
             0              3
> .api_test_check(Terra(), "Terra")[c("removed_in_use", "updated_in_use")]
$removed_in_use
character(0)

$updated_in_use
[1] "cloneWorkspace"         "entityQuery"            "flexibleImportEntities"
Imports: AnVIL
importFrom AnVIL, Service
importMethodsFrom AnVIL, "$"   # perhaps also `tags()`, etc
importClassesFrom AnVIL, Service
## Class generator for 'MyService', an S4 class extending 'Service'
.MyService <- setClass("MyService", contains = "Service")

## Constructor: describe the RESTful service (name, host, location of the API
## definition, no authentication) as a 'Service' and wrap it in 'MyService'
MyService <- function() {
    service <- Service(
        "myservice",
        host = "api.firecloud.org",
        api_url = "https://api.firecloud.org/api-docs.yaml",
        authenticate = FALSE
    )
    .MyService(service)
}
git clone https://git.bioconductor.org/packages/AnVIL

Workspace	AnVIL function
TABLES	`avtables()`
REFERENCE DATA	None
OTHER DATA	`avbucket()`
Workspace Data	`avdata()`
Files	`avfiles_ls()`, `avfiles_backup()`, `avfiles_restore()`

Introduction to the AnVIL package

17 August 2023

Abstract

Package

Contents

1 Installation

2 Quick start

2.1 Up to speed with AnVIL

2.2 Use in the AnVIL cloud

2.3 Local use

2.4 Graphical interfaces

3 For end users

3.1 Fast binary package installation

3.2 Working with Google cloud-based resources

Using `gcloud_*()` for account management

Using `gsutil_*()` for file and bucket management

3.3 Using `av*()` to work with AnVIL tables and data

Tables, reference data, and persistent files

Using `avtable*()` for accessing tables

Using `avdata()` for accessing Workspace Data

Using `avbucket()` and workspace files

3.4 Using `avnotebooks*()` for notebook management

3.5 Using `avworkflows_*()` for workflows

3.6 Using `avworkspace_*()` for workspaces

3.7 Using `drs_*()` for resolving DRS (Data Repository Service) URIs

4 For developers

4.1 Set-up

4.2 Service APIs

Construction

Invoke endpoints

Process responses

Test endpoints

4.3 Service implementations

4.4 Extending the `Service` class to implement your own RESTful interface

5 Support, bug reports, and source code availability

Appendix

Acknowledgments

Session info

Introduction to the AnVIL package

17 August 2023

Abstract

Package

Contents

1 Installation

2 Quick start

2.1 Up to speed with AnVIL

2.2 Use in the AnVIL cloud

2.3 Local use

2.4 Graphical interfaces

3 For end users

3.1 Fast binary package installation

3.2 Working with Google cloud-based resources

Using gcloud_*() for account management

Using gsutil_*() for file and bucket management

3.3 Using av*() to work with AnVIL tables and data

Tables, reference data, and persistent files

Using avtable*() for accessing tables

Using avdata() for accessing Workspace Data

Using avbucket() and workspace files

3.4 Using avnotebooks*() for notebook management

3.5 Using avworkflows_*() for workflows

3.6 Using avworkspace_*() for workspaces

3.7 Using drs_*() for resolving DRS (Data Repository Service) URIs

4 For developers

4.1 Set-up

4.2 Service APIs

Construction

Invoke endpoints

Process responses

Test endpoints

4.3 Service implementations

4.4 Extending the Service class to implement your own RESTful interface

5 Support, bug reports, and source code availability

Appendix

Acknowledgments

Session info

Using `gcloud_*()` for account management

Using `gsutil_*()` for file and bucket management

3.3 Using `av*()` to work with AnVIL tables and data

Using `avtable*()` for accessing tables

Using `avdata()` for accessing Workspace Data

Using `avbucket()` and workspace files

3.4 Using `avnotebooks*()` for notebook management

3.5 Using `avworkflows_*()` for workflows

3.6 Using `avworkspace_*()` for workspaces

3.7 Using `drs_*()` for resolving DRS (Data Repository Service) URIs

4.4 Extending the `Service` class to implement your own RESTful interface