Distances on Directed Graphs in R
at main 203 lines 7.4 kB view raw
1#include "graph.h" 2 3 4//' sample_one_edge_no_comps 5//' 6//' Sample one edge for graph that has no pre-calculated components. Only used 7//' in \code{sample_one_vertex} 8//' 9//' @param edge_map edge_map 10//' @return std::vector of 2 elements: [0] with value of largest connected 11//' component; [1] with random index to one edge that is part of that component. 12//' @noRd 13edge_component graph_sample::sample_one_edge_no_comps (vertex_map_t &vertices, 14 edge_map_t &edge_map) 15{ 16 // TDOD: FIX edge_id_t type defs here! 17 std::unordered_map <vertex_id_t, size_t> components; 18 19 size_t largest_component = 20 graph::identify_graph_components (vertices, components); 21 22 bool in_largest = false; 23 long int e0 = static_cast <long int> (floor (R::runif (0.0, 24 static_cast <double> (edge_map.size ()) - 1.0))); 25 while (!in_largest) 26 { 27 // TODO: The following is an O(N) lookup; maybe just use 28 // edge_map.find (std::to_string (e0)) and handle not found; that'd be 29 // constant time. 30 edge_t this_edge = std::next (edge_map.begin (), e0++)->second; 31 vertex_id_t this_vert = this_edge.get_from_vertex (); 32 if (components [this_vert] == largest_component) 33 in_largest = true; 34 if (e0 >= static_cast <long int> (edge_map.size ())) 35 e0 = 0; // # nocov 36 } 37 38 edge_component edge_comp; 39 edge_comp.component = largest_component; 40 edge_comp.edge = std::next (edge_map.begin (), e0)->first; 41 42 return edge_comp; 43} 44 45//' sample_one_edge_with_comps 46//' 47//' Sample one edge for graph that has pre-calculated components. Only used in 48//' \code{sample_one_vertex} 49//' 50//' @param edge_map edge_map 51//' @return Random index to one edge that is part of the largest connected 52//' component. 53//' @noRd 54edge_id_t graph_sample::sample_one_edge_with_comps (Rcpp::DataFrame graph, 55 edge_map_t &edge_map) 56{ 57 Rcpp::NumericVector component = graph ["component"]; 58 std::uniform_int_distribution <size_t> uni (0, 59 static_cast <size_t> (graph.nrow ()) - 1); 60 long int e0 = static_cast <long int> (floor (R::runif (0.0, 61 static_cast <double> (edge_map.size ()) - 1.0))); 62 while (component (static_cast <size_t> (e0)) > 1L) // can't be invoked in tests in a controlled way 63 e0 = static_cast <long int> (floor (R::runif (0.0, 64 static_cast <double> (edge_map.size ()) - 1.0))); 65 66 return std::next (edge_map.begin (), e0)->first; 67} 68 69vertex_id_t graph_sample::select_random_vert (Rcpp::DataFrame graph, 70 edge_map_t &edge_map, vertex_map_t &vertices) 71{ 72 vertex_id_t this_vert; 73 if (graph::graph_has_components (graph)) 74 { 75 edge_id_t e0 = graph_sample::sample_one_edge_with_comps (graph, edge_map); 76 edge_t this_edge = edge_map.find (e0)->second; 77 this_vert = this_edge.get_from_vertex (); 78 } else 79 { 80 edge_component edge_comp = 81 graph_sample::sample_one_edge_no_comps (vertices, edge_map); 82 edge_t this_edge = edge_map.find (edge_comp.edge)->second; // random edge 83 this_vert = this_edge.get_from_vertex (); 84 } 85 86 return this_vert; 87} 88 89 90//' rcpp_sample_graph 91//' 92//' Randomly sample one connected componnent of a graph 93//' 94//' @param graph graph to be processed 95//' @param nverts_to_sample Number of vertices to sample 96//' @param quiet If TRUE, display progress 97//' 98//' @return Smaller sub-set of \code{graph} 99//' 100//' @noRd 101// [[Rcpp::export]] 102Rcpp::StringVector rcpp_sample_graph (Rcpp::DataFrame graph, 103 size_t nverts_to_sample) 104{ 105 vertex_map_t vertices; 106 edge_map_t edge_map; 107 vert2edge_map_t vert2edge_map; 108 109 bool has_times = graph::graph_from_df (graph, vertices, edge_map, vert2edge_map); 110 has_times = false; // rm unused variable warning 111 112 Rcpp::StringVector edges_out; 113 if (vertices.size () <= nverts_to_sample) 114 return edges_out; // return empty vector # nocov 115 116 std::unordered_map <vertex_id_t, size_t> components; 117 size_t largest_component = 118 graph::identify_graph_components (vertices, components); 119 // simple vert_ids set for quicker random selection: 120 std::unordered_set <vertex_id_t> vert_ids; 121 for (auto v: vertices) 122 if (components [v.first] == largest_component) 123 vert_ids.emplace (v.first); 124 if (vert_ids.size () < nverts_to_sample) 125 { 126 Rcpp::Rcout << "Largest connected component only has " << // # nocov 127 vert_ids.size () << " vertices" << std::endl; // # nocov 128 nverts_to_sample = static_cast <size_t> (vert_ids.size ()); // # nocov 129 } 130 131 vertex_id_t this_vert = 132 graph_sample::select_random_vert (graph, edge_map, vertices); 133 134 // Samples are built by randomly tranwing a vertex list, and inspecting 135 // edges that extend from it. The only effective way to randomly sample a 136 // C++ container is with a std::vector, even though that requires a 137 // std::find each time prior to insertion. It's also useful to know the 138 // size, so this vector is **NOT** reserved, even though it easily could be. 139 // Maybe not the best solution? 140 std::vector <vertex_id_t> vertlist; 141 std::unordered_set <edge_id_t> edgelist; 142 vertlist.push_back (this_vert); 143 144 145 size_t count = 0; 146 while (vertlist.size () < nverts_to_sample) 147 { 148 // initialise random int generator: 149 // TODO: Is this quicker to use a single unif and round each time? 150 std::uniform_int_distribution <size_t> uni (0, 151 static_cast <size_t> (vertlist.size ()) - 1); 152 size_t randv = static_cast <size_t> (floor (R::runif (0.0, 153 static_cast <double> (vertlist.size ()) - 1.0))); 154 this_vert = vertlist [randv]; 155 156 std::unordered_set <edge_id_t> edges = vert2edge_map [this_vert]; 157 for (auto e: edges) 158 { 159 edgelist.insert (e); 160 edge_t this_edge = edge_map.find (e)->second; 161 vertex_id_t vt = this_edge.get_from_vertex (); 162 if (std::find (vertlist.begin(), vertlist.end(), vt) == 163 vertlist.end()) 164 vertlist.push_back (vt); 165 if (vertlist.size () >= nverts_to_sample) 166 { 167 break; 168 } else 169 { 170 vt = this_edge.get_to_vertex (); 171 if (std::find (vertlist.begin(), vertlist.end(), vt) == 172 vertlist.end()) 173 vertlist.push_back (vt); 174 } 175 if (vertlist.size () >= nverts_to_sample) 176 break; 177 } 178 count++; 179 if (count > (100 * nverts_to_sample)) 180 { 181 // likely stuck in some one-way part of graph that can't connect, so 182 // reset to another start node 183 // # nocov start 184 edgelist.clear (); 185 this_vert = 186 graph_sample::select_random_vert (graph, edge_map, vertices); 187 vertlist.clear (); 188 vertlist.push_back (this_vert); 189 count = 0; 190 // # nocov end 191 } 192 } 193 194 size_t nedges = static_cast <size_t> (edgelist.size ()); 195 196 // edgelist is an unordered set, so has to be iteratively inserted 197 edges_out = Rcpp::StringVector (nedges); 198 size_t i = 0; 199 for (auto e: edgelist) 200 edges_out (i++) = e; 201 202 return edges_out; 203}