Extending aPPR • aPPR

Suppose you want to calculate personalized PageRanks for some graph that is not supported by aPPR. You can extend aPPR to arbitrary graph objects, so long the graph object is an S3 object with methods:

See the documentation for those S3 generics to understand the generic specification fully!

We demonstrate how to implement these methods below for a new, custom graph object. In this case, we will consider the PubMed citation network, which we will interact via the PubMed API, using the rentrez package. First, we define a constructor function that returns a custom S3 graph object that subclasses abstract_graph. You can install rentrez with pak::pak("ropensci/rentrez"), and do not need to set up any authentication to begin using the API.

library(aPPR)
library(logger)
library(glue)
library(rentrez)

# constructor for PubMed graph object (defined over API)
pubmed_graph <- function(max_attempts = 3) {
  if (!requireNamespace("rentrez", quietly = TRUE)) {
    stop(
      "`rentrez` package must be installed to use `pubmed_graph()`",
      call. = FALSE
    )
  }

  agraph <- abstract_graph(
    subclass = "pubmed",
    max_attempts = max_attempts
  )

  agraph
}

graph <- pubmed_graph()
graph
#> Abstract graph object (subclass: pubmed)

Now we want to implement S3 methods for the pubmed object. In some cases, you can query data from a graph in large batches, but with the PubMed API it simpler (at least to my limited knowledge) to query node by node, with no bulk lookups. It turns out that we can get neighborhoods and node in-degree and node out-degree all at once, using rentrez::entrez_link(). We give the function three attempts (by default) to successfully complete this API call, since APIs sometimes fail. Then, since we will need this information repeatedly, we memoize the function, to avoid repeated calls to the API.

# one node at a time
get_pubmed_data <- function(graph, node) {
  for (i in 1:graph$max_attempts) {
    log_trace(
      glue("Attempt {i}/{graph$max_attempts} to get node degrees: {node}")
    )

    tryCatch(
      {
        cites <- entrez_link(dbfrom = "pubmed", db = "all", id = node)
        break
      },
      error = function(cnd) {
        if (i == graph$max_attempts) {
          log_debug(
            glue("Maximum attempts to find neighborhood met, could not find: {node}")
          )
          stop("Couldn't pull data for node")
        }
      }
    )
  }

  data <- list(
    refs = unique(cites$links$pubmed_pubmed_refs),
    citedby = unique(cites$links$pubmed_pubmed_citedin)
  )

  data$num_refs <- length(data$refs)
  data$num_citedby <- length(data$citedby)
  data
}

memo_get_pubmed_data <- memoise::memoise(get_pubmed_data)

Now we test the function. I’m not currently sure that it’s working: it’s suspicious for two papers to have in-degree and out-degree all equal to 18 – we need to dig into this and find out if the API is limited to returning a maximum of API results in a single call, for example.

good_node_ids <- c("30345262", "29624432", "29867837")
bad_node_id <- "I am a pumpkin"
mixed_node_ids <- c(good_node_ids, bad_node_id)

# this is suspicious to me, something seems wrong here
memo_get_pubmed_data(graph, good_node_ids[1])
#> $refs
#>  [1] "28956619" "27755345" "24936257" "24059552" "23149807" "22371471"
#>  [7] "22128211" "21548822" "20840052" "20836729" "20689101" "20036928"
#> [13] "19879747" "19783003" "19718579" "18792926" "18477059" "18256996"
#> [19] "17943776" "17920516" "17499247" "17127265" "16601269" "16556615"
#> [25] "16061595" "15687416" "15665383" "15499145" "15177673" "14725946"
#> [31] "12667636" "12496220" "11150646" "11041438" "10386225" "10225288"
#> [37] "9768331"  "9691157"  "9227580"  "9145435"  "9131263"  "8525349" 
#> [43] "7999032"  "7733322"  "7720662"  "7603298"  "6607683"  "3029864" 
#> 
#> $citedby
#> [1] "35455075" "33190629" "32785463" "31979022"
#> 
#> $num_refs
#> [1] 48
#> 
#> $num_citedby
#> [1] 4


# suspicious that the number of in-cites and out-cites matches, and that it
# matches across both papers! TODO: investigate!
memo_get_pubmed_data(graph, good_node_ids[2])
#> $refs
#> NULL
#> 
#> $citedby
#>  [1] "38693465" "38284617" "36892193" "36293784" "36111739" "35799170"
#>  [7] "35757994" "35565846" "34211314" "34205900" "33976584" "33210456"
#> [13] "32911625" "31835736" "31236026" "31137691" "30882237" "30592165"
#> [19] "30234121" "26061694" "15888210"
#> 
#> $num_refs
#> [1] 0
#> 
#> $num_citedby
#> [1] 21

memo_get_pubmed_data(graph, good_node_ids[3])
#> $refs
#>  [1] "28418291" "28159212" "28135118" "28103607" "28088062" "27936476"
#>  [7] "27889615" "27831549" "27755345" "27684560" "27566870" "27490567"
#> [13] "26903304" "26741949" "26334428" "26238662" "26190380" "26116636"
#> [19] "26086569" "25794697" "25769062" "25728542" "25313944" "24767058"
#> [25] "24752468" "24376898" "24200019" "24169308" "24084373" "23805943"
#> [31] "23769834" "23713111" "23274339" "23271066" "23178065" "23178060"
#> [37] "22371471" "22245400" "21921811" "21638200" "21622093" "21247481"
#> [43] "20946146" "20509779" "19832716" "19516191" "19414822" "19127217"
#> [49] "18845249" "18477059" "18422617" "18419467" "17868474" "17661617"
#> [55] "17517714" "16266685" "15886342" "14657823" "14633102" "12732956"
#> [61] "11740876" "10791723" "10432587" "8489717"  "8274003"  "6323663" 
#> [67] "318682"  
#> 
#> $citedby
#>  [1] "38613123" "38455872" "38129684" "37060103" "36330365" "36193380"
#>  [7] "35874591" "35454995" "35127595" "35124754" "33623528" "33546253"
#> [13] "33276234" "31803156" "31137691" "30873395" "30770541" "30560111"
#> [19] "30443539" "30356694" "30319659"
#> 
#> $num_refs
#> [1] 67
#> 
#> $num_citedby
#> [1] 21


# check that we handle bad node ids in some reliable way, in this case
# it looks we get empty results
memo_get_pubmed_data(graph, bad_node_id)
#> $refs
#> NULL
#> 
#> $citedby
#> NULL
#> 
#> $num_refs
#> [1] 0
#> 
#> $num_citedby
#> [1] 0

#' Check method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Can be empty!**
#'
#' @return A **character** vector of node ids that we can reach in the graph.
#'   For example, some nodes ids may not be reachable due to API failures,
#'   or, more generally, permissions failures.
#'
#'   If `nodes` is the empty vector, returns the empty vector. Be sure to
#'   handle this edge case.
check.pubmed <- function(graph, nodes) {
  log_debug(glue("Checking nodes"))

  # handle the case where no nodes are passed
  if (length(nodes) < 1) {
    return(character(0))
  }

  good_nodes <- character(0)

  for (node in nodes) {
    node_data <- memo_get_pubmed_data(graph, node)

    # this is a sufficient check to see if (1) the node is in pubmed, (2)
    # we can pull it's neighborhood, and (3) it has at least one
    # incoming or outgoing citation

    if (node_data$num_refs + node_data$num_citedby > 0) {
      log_trace(glue("Checked node: {node} (good)"))
      good_nodes <- c(good_nodes, node)
      next
    }

    log_trace(glue("Checked node: {node} (bad)"))
  }

  good_nodes
}

Now we test our implementation. To do this, we should give at least one good node id, and at least one bad node id. Only the good node id should be returned.

check(graph, good_node_ids)
#> [1] "30345262" "29624432" "29867837"

check(graph, bad_node_id)
#> character(0)

check(graph, mixed_node_ids)
#> [1] "30345262" "29624432" "29867837"

#' Degree method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A **character** vector of node ids. **Cannot be empty.** Should
#'   not contain duplicates if `check()` is properly implemented and does
#'   not output duplicates.
#'
#' @return A list, with two elements, `in_degree` and `out_degree`. Both
#'   should be the same length as `nodes`, and match the order of `nodes`.
#'
node_degrees.pubmed <- function(graph, nodes) {
  log_debug(glue("Getting node degrees"))

  degrees <- list(
    in_degree = integer(length(nodes)),
    out_degree = integer(length(nodes))
  )

  for (i in seq_along(nodes)) {
    log_debug(glue("Getting node degrees for node: {nodes[i]}"))
    node_data <- memo_get_pubmed_data(graph, nodes[i])

    # must treat pubmed like an undirected graph. otherwise it's a citation
    # network, and thus a tree, and thus no pair of nodes is mutually
    # reachable, thus pagerank is not defined

    degrees$in_degree[i] <- node_data$num_citedby + node_data$num_refs
    degrees$out_degree[i] <- node_data$num_citedby + node_data$num_refs

    log_trace(glue("In-degree for node: {degrees$in_degree[i]}"))
    log_trace(glue("Out-degree for node: {degrees$out_degree[i]}"))
  }

  log_debug(glue("Done getting node degrees"))

  degrees
}

To test this method, we should pass a character vector of several good node ids.

# test with a single node
node_degrees(graph, good_node_ids[1])
#> $in_degree
#> [1] 52
#> 
#> $out_degree
#> [1] 52


# test with multiple nodes! this is the key one! this is suspicious, and
# means we need to check if our function memo_get_pubmed_data() is working
node_degrees(graph, good_node_ids)
#> $in_degree
#> [1] 52 21 88
#> 
#> $out_degree
#> [1] 52 21 88

#' Neighborhood method for `pubmed` graph objects
#'
#' @param graph A `pubmed` graph object
#' @param nodes A length one character vector, for a node in the graph with
#'   at least one outgoing edge.
#'
#' @return A **character** vector of node ids for the graph neighborhood.
#'   Should be a vector of length at least one (if the `check()` method was
#'   implemented correctly), and should not contain duplicates.
neighborhood.pubmed <- function(graph, node) {
  if (length(node) > 1) {
    stop("`node` must be a character vector of length one.")
  }

  log_debug(glue("Getting neighborhood: {node}"))
  node_data <- memo_get_pubmed_data(graph, node)
  log_debug(glue("Done getting neighborhood: {node}"))
  unique(node_data$refs, node_data$citedby)
}

neighborhood(graph, good_node_ids[1])
#>  [1] "28956619" "27755345" "24936257" "24059552" "23149807" "22371471"
#>  [7] "22128211" "21548822" "20840052" "20836729" "20689101" "20036928"
#> [13] "19879747" "19783003" "19718579" "18792926" "18477059" "18256996"
#> [19] "17943776" "17920516" "17499247" "17127265" "16601269" "16556615"
#> [25] "16061595" "15687416" "15665383" "15499145" "15177673" "14725946"
#> [31] "12667636" "12496220" "11150646" "11041438" "10386225" "10225288"
#> [37] "9768331"  "9691157"  "9227580"  "9145435"  "9131263"  "8525349" 
#> [43] "7999032"  "7733322"  "7720662"  "7603298"  "6607683"  "3029864"

neighborhood(graph, good_node_ids[2])
#> NULL

neighborhood(graph, good_node_ids[3])
#>  [1] "28418291" "28159212" "28135118" "28103607" "28088062" "27936476"
#>  [7] "27889615" "27831549" "27755345" "27684560" "27566870" "27490567"
#> [13] "26903304" "26741949" "26334428" "26238662" "26190380" "26116636"
#> [19] "26086569" "25794697" "25769062" "25728542" "25313944" "24767058"
#> [25] "24752468" "24376898" "24200019" "24169308" "24084373" "23805943"
#> [31] "23769834" "23713111" "23274339" "23271066" "23178065" "23178060"
#> [37] "22371471" "22245400" "21921811" "21638200" "21622093" "21247481"
#> [43] "20946146" "20509779" "19832716" "19516191" "19414822" "19127217"
#> [49] "18845249" "18477059" "18422617" "18419467" "17868474" "17661617"
#> [55] "17517714" "16266685" "15886342" "14657823" "14633102" "12732956"
#> [61] "11740876" "10791723" "10432587" "8489717"  "8274003"  "6323663" 
#> [67] "318682"

Lastly, you can optionally implement an appr method for your abstract graph subclass. In the appr method for the subclass, you can do things like:

Add functionality to convert a convenient seed node name (in this, possibly something like a DOI) into the internal node name representation (see appr.rtweet_graph() for an example of this)
Checks that you have appropriate authorization to pull information about the seed nodes
Etc, etc

This custom subclass method will run before the general appr.abstract_graph(). We don’t have a particular need to do anything of that here, so we do not.

Debugging

If you are accessing a graph over an API, it’s likely that you will encounter edge cases where the API returns no data, or data in a format that you did not expect. We highly recommend using logging to debug your implementation when this happens, using the logger library. See that logger documentation for details.

Find any errors, fix, and rinse and repeat until you’ve completed the likely unpleasant task of tracking down all the edges cases in the API. In our, we don’t seem to find any edge cases right away.

library(logger)

# set logging threshold for code you just wrote, if desired
log_threshold(TRACE)

# set logging threshold for aPPR package functions, if desired
log_threshold(DEBUG, namespace = "aPPR")

appr(
  graph,                     # the graph to work with
  seeds = good_node_ids[1],  # name of seed node (character)
  epsilon = 0.0005,          # desired approximation quality
  max_visits = 10            # bound computation since this is an example
)
#> Warning: Maximum visits reached. Finishing aPPR calculation early.
#> Personalized PageRank Approximator
#> ----------------------------------
#> 
#>   - number of seeds: 1
#>   - unique nodes visited so far: 10 out of maximum of 10
#>   - total visits so far: 11
#>   - bad nodes so far: 0
#> 
#>   - teleportation constant (alpha): 0.15
#>   - desired approximation error (epsilon): 5e-04
#>   - achieved bound on approximation error: 0.00883575883575884
#>   - length of to visit list: 19
#> 
#> PPR table (see $stats field):
#> # A tibble: 49 × 7
#>    name     regularized        p in_degree out_degree degree_adjusted       r
#>    <chr>          <dbl>    <dbl>     <dbl>      <dbl>           <dbl>   <dbl>
#>  1 30345262  0.000517   0.0811          52         52       0.00156   0.459  
#>  2 28956619  0          0                1          1       0         0.00884
#>  3 27755345  0          0               33         33       0         0.00884
#>  4 24936257  0          0               88         88       0         0.00884
#>  5 24059552  0          0               15         15       0         0.00884
#>  6 23149807  0.00000602 0.000716        14         14       0.0000512 0.00406
#>  7 22371471  0          0             1321       1321       0         0.00884
#>  8 22128211  0          0               33         33       0         0.00884
#>  9 21548822  0          0               20         20       0         0.00884
#> 10 20840052  0          0               53         53       0         0.00884
#> # ℹ 39 more rows