From 182d89532074ddc63028d1f81ab240e3cf9d4629 Mon Sep 17 00:00:00 2001 From: "Philip (flip) Kromer" Date: Tue, 16 Sep 2014 13:31:10 -0500 Subject: [PATCH] organizing and attic-ing material to determine final outline --- 00-outlines.asciidoc | 10 + ....asciidoc => 02-1-part_one-basics.asciidoc | 0 ...sciidoc => 04-2-part_two-patterns.asciidoc | 0 ...sciidoc => 04-introduction_to_pig.asciidoc | 0 ....asciidoc => 05-map_only_patterns.asciidoc | 20 +- 06-analytic_patterns-old.asciidoc | 188 --------- ....asciidoc => 06-grouping_patterns.asciidoc | 42 ++ ...g.asciidoc => 07-joining_patterns.asciidoc | 129 ++++++ ....asciidoc => 08-ordering_patterns.asciidoc | 0 09-statistics.asciidoc | 370 ------------------ ....asciidoc => 09-uniquing_patterns.asciidoc | 0 10-advanced_patterns.asciidoc | 346 ++++++++++++++++ 10a-event_streams-more.asciidoc | 140 ------- ...c => 11-3-part_three-applications.asciidoc | 0 ...eams.asciidoc => 11-event_streams.asciidoc | 139 +++++++ ...sciidoc => 12-geospatial_analysis.asciidoc | 0 12-placeholder.asciidoc | 1 - ....asciidoc => 12a-geospatial-intro.asciidoc | 0 ...eospatial-aggregations_on_regions.asciidoc | 0 ...iidoc => 12c-geospatial-mechanics.asciidoc | 0 ...iidoc => 12d-geospatial-quadtiles.asciidoc | 0 ...> 12e-geospatial-weather_near_you.asciidoc | 0 ...ic.asciidoc => 12g-geospatial-end.asciidoc | 0 ...12z-geospatial-spatial_manor-data.asciidoc | 0 ...ysis.asciidoc => 13-text_analysis.asciidoc | 0 16-part_four.asciidoc | 1 - 40-4-part_four-practicalities.asciidoc | 1 + ...asciidoc => 41-big_data_ecosystem.asciidoc | 0 ...ta.asciidoc => 42-organizing_data.asciidoc | 0 ...mats.asciidoc => 42a-data_formats.asciidoc | 0 ...jo.asciidoc => 43-filesystem_mojo.asciidoc | 0 ...nging.asciidoc => 45-data_munging.asciidoc | 0 ...r.asciidoc => 45a-wikipedia_other.asciidoc | 0 ...=> 45b-spatial_aggregation-points.asciidoc | 0 ....asciidoc => 45c-wikipedia_corpus.asciidoc | 0 ....asciidoc => 45d-munging-patterns.asciidoc | 0 ...s.asciidoc => 45e-airline_flights.asciidoc | 0 ...her.asciidoc => 45f-daily_weather.asciidoc | 0 ....asciidoc => 45h-other_strategies.asciidoc | 0 50-5-part_five-internals_and_tuning.asciidoc | 1 + 18-java_api.asciidoc => 51-java_api.asciidoc | 0 ...d_pig.asciidoc => 52-advanced_pig.asciidoc | 0 ..._pig.asciidoc => 52a-advanced_pig.asciidoc | 0 ...iidoc => 52a-tuning-wise_and_lazy.asciidoc | 0 ...pig_udfs.asciidoc => 52b-pig_udfs.asciidoc | 0 ...oop_internals-just_enough_for_now.asciidoc | 0 ...3-hadoop_tuning-brave_and_foolish.asciidoc | 0 ...> 53b-hadoop_internals-map_reduce.asciidoc | 0 ....asciidoc => 53b-tuning-pathology.asciidoc | 0 ...iidoc => 53d-use_method_checklist.asciidoc | 0 ...iidoc => 54-hadoop_internals-logs.asciidoc | 0 ...s.asciidoc => 54-hadoop_internals.asciidoc | 0 ...ning.asciidoc => 54-hadoop_tuning.asciidoc | 0 ...sciidoc => 55-hbase_data_modeling.asciidoc | 0 ...hema.asciidoc => 55a-hbase_schema.asciidoc | 0 25-appendix.asciidoc => 80-appendix.asciidoc | 0 ...xercises.asciidoc => 83-exercises.asciidoc | 0 ...iidoc => 83a-overview_of_datasets.asciidoc | 0 ...rences.asciidoc => 83c-references.asciidoc | 0 ...ciidoc => 84d-overview_of_scripts.asciidoc | 0 ...glossary.asciidoc => 85f-glossary.asciidoc | 0 ...k_cover.asciidoc => 86-back_cover.asciidoc | 0 E_and_C.md | 12 + .../02-feedback_and_response.asciidoc | 0 .../08-intro_to_storm+trident.asciidoc | 0 .../11-statistics.asciidoc | 0 .../16-conceptual_model.asciidoc | 0 .../17-machine_learning.asciidoc | 0 .../24-storm+trident-internals.asciidoc | 0 
.../24-storm+trident-topology.graffle | 0 .../24-storm+trident-topology.png | Bin .../24a-storm+trident-overview.asciidoc | 0 .../25-storm+trident-tuning.asciidoc | 0 .../90-style_guide.asciidoc | 0 .../99-dumping_ground.asciidoc | 0 .../26-cheatsheet-sql_hive_pig.asciidoc | 17 + ...9-business_applications_of_hadoop.asciidoc | 0 77 files changed, 707 insertions(+), 710 deletions(-) rename 02-part_one.asciidoc => 02-1-part_one-basics.asciidoc (100%) rename 05-part_two.asciidoc => 04-2-part_two-patterns.asciidoc (100%) rename 04-structural_operations.asciidoc => 04-introduction_to_pig.asciidoc (100%) rename 05-analytic_patterns-pipeline_operations.asciidoc => 05-map_only_patterns.asciidoc (98%) delete mode 100644 06-analytic_patterns-old.asciidoc rename 06-analytic_patterns-structural_operations-grouping.asciidoc => 06-grouping_patterns.asciidoc (98%) rename 06-analytic_patterns-structural_operations-joining.asciidoc => 07-joining_patterns.asciidoc (81%) rename 06-analytic_patterns-structural_operations-ordering.asciidoc => 08-ordering_patterns.asciidoc (100%) delete mode 100644 09-statistics.asciidoc rename 06-analytic_patterns-structural_operations-uniquing.asciidoc => 09-uniquing_patterns.asciidoc (100%) delete mode 100644 10a-event_streams-more.asciidoc rename 07-part_three.asciidoc => 11-3-part_three-applications.asciidoc (100%) rename 10-event_streams.asciidoc => 11-event_streams.asciidoc (57%) rename 11-geographic.asciidoc => 12-geospatial_analysis.asciidoc (100%) delete mode 100644 12-placeholder.asciidoc rename 11a-geodata-intro.asciidoc => 12a-geospatial-intro.asciidoc (100%) rename 11c-spatial_aggregations_on_regions.asciidoc => 12c-geospatial-aggregations_on_regions.asciidoc (100%) rename 11c-geospatial_mechanics.asciidoc => 12c-geospatial-mechanics.asciidoc (100%) rename 11d-quadtiles.asciidoc => 12d-geospatial-quadtiles.asciidoc (100%) rename 11e-weather_near_you.asciidoc => 12e-geospatial-weather_near_you.asciidoc (100%) rename 11g-end_of_geographic.asciidoc => 12g-geospatial-end.asciidoc (100%) rename 11z-spatial_manor-data.asciidoc => 12z-geospatial-spatial_manor-data.asciidoc (100%) rename 12-text_analysis.asciidoc => 13-text_analysis.asciidoc (100%) delete mode 100644 16-part_four.asciidoc create mode 100644 40-4-part_four-practicalities.asciidoc rename 07-big_data_ecosystem.asciidoc => 41-big_data_ecosystem.asciidoc (100%) rename 14-organizing_data.asciidoc => 42-organizing_data.asciidoc (100%) rename 11f-data_formats.asciidoc => 42a-data_formats.asciidoc (100%) rename 15-filesystem_mojo.asciidoc => 43-filesystem_mojo.asciidoc (100%) rename 13-data_munging.asciidoc => 45-data_munging.asciidoc (100%) rename 13a-wikipedia_other.asciidoc => 45a-wikipedia_other.asciidoc (100%) rename 11b-spatial_aggregation-points.asciidoc => 45b-spatial_aggregation-points.asciidoc (100%) rename 13c-wikipedia_corpus.asciidoc => 45c-wikipedia_corpus.asciidoc (100%) rename 13d-munging-patterns.asciidoc => 45d-munging-patterns.asciidoc (100%) rename 13e-airline_flights.asciidoc => 45e-airline_flights.asciidoc (100%) rename 13f-daily_weather.asciidoc => 45f-daily_weather.asciidoc (100%) rename 13h-other_strategies.asciidoc => 45h-other_strategies.asciidoc (100%) create mode 100644 50-5-part_five-internals_and_tuning.asciidoc rename 18-java_api.asciidoc => 51-java_api.asciidoc (100%) rename 19-advanced_pig.asciidoc => 52-advanced_pig.asciidoc (100%) rename 19a-advanced_pig.asciidoc => 52a-advanced_pig.asciidoc (100%) rename 22a-tuning-wise_and_lazy.asciidoc => 52a-tuning-wise_and_lazy.asciidoc 
(100%) rename 19b-pig_udfs.asciidoc => 52b-pig_udfs.asciidoc (100%) rename 12-hadoop_internals-just_enough_for_now.asciidoc => 53-hadoop_internals-just_enough_for_now.asciidoc (100%) rename 23-hadoop_tuning-brave_and_foolish.asciidoc => 53-hadoop_tuning-brave_and_foolish.asciidoc (100%) rename 21b-hadoop_internals-map_reduce.asciidoc => 53b-hadoop_internals-map_reduce.asciidoc (100%) rename 22b-tuning-pathology.asciidoc => 53b-tuning-pathology.asciidoc (100%) rename 22d-use_method_checklist.asciidoc => 53d-use_method_checklist.asciidoc (100%) rename ha1b-hadoop_internals-logs.asciidoc => 54-hadoop_internals-logs.asciidoc (100%) rename 21-hadoop_internals.asciidoc => 54-hadoop_internals.asciidoc (100%) rename 22-hadoop_tuning.asciidoc => 54-hadoop_tuning.asciidoc (100%) rename 20-hbase_data_modeling.asciidoc => 55-hbase_data_modeling.asciidoc (100%) rename 20a-hbase_schema.asciidoc => 55a-hbase_schema.asciidoc (100%) rename 25-appendix.asciidoc => 80-appendix.asciidoc (100%) rename 27-exercises.asciidoc => 83-exercises.asciidoc (100%) rename 23a-overview_of_datasets.asciidoc => 83a-overview_of_datasets.asciidoc (100%) rename 25c-references.asciidoc => 83c-references.asciidoc (100%) rename 25d-overview_of_scripts.asciidoc => 84d-overview_of_scripts.asciidoc (100%) rename 25f-glossary.asciidoc => 85f-glossary.asciidoc (100%) rename 26-back_cover.asciidoc => 86-back_cover.asciidoc (100%) create mode 100644 E_and_C.md rename 02-feedback_and_response.asciidoc => attic/02-feedback_and_response.asciidoc (100%) rename 08-intro_to_storm+trident.asciidoc => attic/08-intro_to_storm+trident.asciidoc (100%) rename 09x-statistics-to_integrate.asciidoc => attic/11-statistics.asciidoc (100%) rename 16-conceptual_model.asciidoc => attic/16-conceptual_model.asciidoc (100%) rename 17-machine_learning.asciidoc => attic/17-machine_learning.asciidoc (100%) rename 24-storm+trident-internals.asciidoc => attic/24-storm+trident-internals.asciidoc (100%) rename 24-storm+trident-topology.graffle => attic/24-storm+trident-topology.graffle (100%) rename 24-storm+trident-topology.png => attic/24-storm+trident-topology.png (100%) rename 24a-storm+trident-overview.asciidoc => attic/24a-storm+trident-overview.asciidoc (100%) rename 25-storm+trident-tuning.asciidoc => attic/25-storm+trident-tuning.asciidoc (100%) rename 90-style_guide.asciidoc => attic/90-style_guide.asciidoc (100%) rename 99-dumping_ground.asciidoc => attic/99-dumping_ground.asciidoc (100%) create mode 100644 cheatsheets/26-cheatsheet-sql_hive_pig.asciidoc rename 99-business_applications_of_hadoop.asciidoc => supplementary/99-business_applications_of_hadoop.asciidoc (100%) diff --git a/00-outlines.asciidoc b/00-outlines.asciidoc index 0d4109f..763dc66 100644 --- a/00-outlines.asciidoc +++ b/00-outlines.asciidoc @@ -1,4 +1,14 @@ +9. Statistics +10. Event streams -- has some good examples, no real flow. The topic I'd be most excited to get in the book is the geo-ip matching, which demonstrates a range join. +12, 21, 22, 23. Hadoop internals and tuning. As you can see just from the number of files involved, this is particularly disorganized. If you and I worked out a structure of what should be there, I can organize the spare parts around it. +13. Data munging. This is some of the earliest material and thus some of the messiest. I don't believe this is worth reworking. +14. Organizing data -- the only real material here is a rundown of data formats. Rough. +15. Filesystem mojo and `cat` herding -- runs down the command-line tools: wc, cut, etc.
This is actually in decent shape, but should become an appendix, I think. +18. Native Java API -- I'd like to have this chapter in there with either the content being the single sentence "Don't", or that sentence plus one prose paragraph saying you should write Hive or Pig UDFs instead. +19. Advanced Pig -- the material that's there, on Pig config variables and two of the fancy joins, is not too messy. I'd like to at least tell readers about the replicated join, and probably even move it into the earlier chapters. The most we should do here would be to also describe an inline Python UDF and a Java UDF, but there's no material for that (though I do have code examples of UDFs) + + 10. **Event log** - geo IP via range query - sessionizing, user paths diff --git a/02-part_one.asciidoc b/02-1-part_one-basics.asciidoc similarity index 100% rename from 02-part_one.asciidoc rename to 02-1-part_one-basics.asciidoc diff --git a/05-part_two.asciidoc b/04-2-part_two-patterns.asciidoc similarity index 100% rename from 05-part_two.asciidoc rename to 04-2-part_two-patterns.asciidoc diff --git a/04-structural_operations.asciidoc b/04-introduction_to_pig.asciidoc similarity index 100% rename from 04-structural_operations.asciidoc rename to 04-introduction_to_pig.asciidoc diff --git a/05-analytic_patterns-pipeline_operations.asciidoc b/05-map_only_patterns.asciidoc similarity index 98% rename from 05-analytic_patterns-pipeline_operations.asciidoc rename to 05-map_only_patterns.asciidoc index afa7cca..5bbfecb 100644 --- a/05-analytic_patterns-pipeline_operations.asciidoc +++ b/05-map_only_patterns.asciidoc @@ -1,9 +1,9 @@ -== Analytic Patterns part 1: Pipeline Operations +== Analytic Patterns part 1: Map-only Operations -This chapter focuses exclusively on what we'll call 'pipelineable operations'. -A pipelineable operations is one that can handle each record in isolation, like the translator chimps from Chimpanzee & Elephant's first job. That property makes those operations trivially parallelizable: they require no reduce phase of their own. +This chapter focuses exclusively on what we'll call 'map-only operations'. +A map-only operation is one that can handle each record in isolation, like the translator chimps from Chimpanzee & Elephant's first job. That property makes those operations trivially parallelizable: they require no reduce phase of their own. -When a script has only pipelineable operations, they give rise to one mapper-only job which executes the composed pipeline stages. When pipelinable operations are combined with the structural operations you'll meet in the next chapter, they are composed with the stages of the mapper or reducer (depending on whether they come before or after the structural operation). +When a script has only map-only operations, they give rise to one mapper-only job which executes the composed pipeline stages. When map-only operations are combined with the structural operations you'll meet in the next chapter, they are composed with the stages of the mapper or reducer (depending on whether they come before or after the structural operation). All of these are listed first and together for two reasons. One, they are largely fundamental; it's hard to get much done without `FILTER` or `FOREACH`. Two, the way you reason about the performance impact of these operations is largely the same. Since these operations are trivially parallelizable, they scale efficiently and the computation cost rarely impedes throughput.
And when pipelined, their performance cost can be summarized as "kids eat free with purchase of adult meal". For datasets of any material size, it's very rare that the cost of preliminary or follow-on processing rivals the cost of the reduce phase. Finally, since these operations handle records in isolation, their memory impact is modest. So learn to think of these together. @@ -70,7 +70,7 @@ Blocks like the following will show up after each of the patterns or groups of p - Programmers take note: `AND`, `OR` -- not `&&`, `||`. * _Output Count_ -- (_How many records in the output: fewer, same, more, explosively more?_) Zero to 100% of the input record count. Data size will decrease accordingly * _Records_ -- (_A sketch of what the records coming out of this operation look like_) Identical to input -* _Data Flow_ -- (_The Hadoop jobs this operation gives rise to. In this chapter, all the lines will look like this one; in the next chapters that will change_) Pipelinable: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. +* _Data Flow_ -- (_The Hadoop jobs this operation gives rise to. In this chapter, all the lines will look like this one; in the next chapters that will change_) Map-Only: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. * _Exercises for You_ -- (_A mission to carry forward, if you choose. Don't go looking for an answer section -- we haven't done any of them. In many cases you'll be the first to find the answer._) Play around with `null`s and the conditional operators until you have a good sense of its quirks. * _See Also_ -- (_Besides the patterns in its section of the book, what other topics might apply if you're considering this one? Sometimes this is another section in the book, sometimes it's a pointer elsewhere_) The Distinct operations, some Set operations, and some Joins are also used to eliminate records according to some criteria. See especially the Semi-Join and Anti-Join (REF), which select or reject matches against a large list of keys. @@ -123,7 +123,7 @@ NOTE: Sadly, the Nobel Prize-winning physicists Gerard 't Hooft, Louis-Victor Pi - You're far better off learning one extra thing to do with a regular expression than most of the other string conditional functions Pig offers. - ... and enough other Importants to Know that we made a sidebar of them (REF). * _Records_ -- You can use this in a filter clause but also anywhere else an expression is permitted, like the preceding snippet -* _Data Flow_ -- Pipelinable: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. +* _Data Flow_ -- Map-Only: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. * _Exercises for You_ -- Follow the http://regexp.info/tutorial.html[regexp.info tutorial], but _only up to the part on Grouping & Capturing_. The rest you are far better off picking up once you find you need it. * _See Also_ -- The Pig `REGEX_EXTRACT` and http://pig.apache.org/docs/r0.12.0/func.html#replace[`REPLACE`] functions. Java's http://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html#sum[Regular Expression] documentation for details on its pecadilloes (but not for an education about regular expressions). 
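To make the composition concrete, here is a minimal map-only sketch in Pig. The relation and field names (`bat_seasons`, `nameFirst`, `nameLast`, `year_id`, `HR`) are illustrative stand-ins rather than references to the book's scripts; the point is that both statements handle one record at a time, so Pig composes them into the map stage of a single mapper-only job with no reduce phase.

----
-- A regular-expression FILTER followed by a FOREACH projection: neither needs
-- a reduce, so together they run as one map-only job.
modern_js  = FILTER bat_seasons BY (year_id >= 1980) AND (nameFirst MATCHES '^J.*');
name_stats = FOREACH modern_js GENERATE nameFirst, nameLast, year_id, HR;
----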
@@ -152,7 +152,7 @@ The general case is handled bu using a join, as described in the next chapter (R * _Hello, SQL Users_ -- This isn't anywhere near as powerful as SQL's `IN` expression. Most importantly, you can't supply another table as the list. * _Important to Know_ -- A regular expression alternation is often the right choice instead. * _Output Count_ -- As many records as the cardinality of its key, i.e. the number of distinct values. Data size should decrease greatly. -* _Data Flow_ -- Pipelinable: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. +* _Data Flow_ -- Map-Only: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. === Project Only Chosen Columns by Name @@ -194,7 +194,7 @@ The first projection puts the `home_team_id` into the team slot, renaming it `te * _Important to Know_ -- As you can see, we take a lot of care visually aligning subexpressions within the code snippets. That's not because we've tidied up the house for students coming over -- this is what the code we write and the code our teammates expect us to write looks like. * _Output Count_ -- Exactly the same as the input. * _Records_ -- However you define them to be -* _Data Flow_ -- Pipelinable: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. +* _Data Flow_ -- Map-Only: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. * _See Also_ -- "Assembling Literals with Complex Type" (REF) ==== Extracting a Random Sample of Records @@ -219,7 +219,7 @@ Experienced software developers will reach for a "seeding" function -- such as R - The DataFu package has UDFs for sampling with replacement and other advanced features. * _Output Count_ -- Determined by the sampling fraction. As a rule of thumb, variances of things are square-root-ish; expect the size of a 10% sample to be in the 7%-13% range. * _Records_ -- Identical to the input -* _Data Flow_ -- Pipelinable: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. +* _Data Flow_ -- Map-Only: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. * _Exercises for You_ -- Modify Pig's SAMPLE function to accept a seed parameter, and submit that patch back to the open-source project. This is a bit harder to do than it seems: sampling is key to efficient sorting and so the code to sample data is intertwingled with a lot of core functionality. ==== Extracting a Consistent Sample of Records by Key @@ -242,7 +242,7 @@ We called this a terrible hash function, but it does fit the bill. When applied - If you'll be spending a bunch of time with a data set, using any kind of random sample to prepare your development sample might be a stupid idea. You'll notice that Red Sox players show up a lot of times in our examples -- that's because our development samples are "seasons by Red Sox players" and "seasons from 2000-2010", which lets us make good friends with the data. * _Output Count_ -- Determined by the sampling fraction. As a rule of thumb, variances of things are square-root-ish; expect the size of a 10% sample to be in the 7%-13% range. * _Records_ -- Identical to the input -* _Data Flow_ -- Pipelinable: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. 
+* _Data Flow_ -- Map-Only: it's composed onto the end of the preceding map or reduce, and if it stands alone becomes a map-only job. ==== Sampling Carelessly by Only Loading Some `part-` Files diff --git a/06-analytic_patterns-old.asciidoc b/06-analytic_patterns-old.asciidoc deleted file mode 100644 index a9379d5..0000000 --- a/06-analytic_patterns-old.asciidoc +++ /dev/null @@ -1,188 +0,0 @@ - -=== How a group works - ------- -team_n_parks = FOREACH (GROUP park_teams BY (team_id,year_id)) GENERATE - group.team_id, COUNT_STAR(park_teams) AS n_parks; -vagabonds = FILTER team_n_parks BY n_parks >= 3; - -DUMP vagabonds; -(CL4,7) -(CLE,5) -(WS3,4) -(CLE,3) -(DET,3) -... ------- - ------- -mapper(array_fields_of: ParkTeamYear) do |park_id, team_id, year_id, beg_date, end_date, n_games| - yield [team_id, year_id] -end - -# In effect, what is happening in Java: -reducer do |(team_id, year_id), stream| - n_parks = 0 - stream.each do |*_| - n_parks += 1 - end - yield [team_id, year_id, n_parks] if n_parks > 1 -end ------- - -(In actual practice, the ruby version would just call `n_parks = stream.size` rather than iterating) - - -TODO in part on groups note As Jon Covent says, "Bags are what makes Pig Awesome". SQL doesn't have them, and they bring extraordinary power. They can be of arbitrarily large size, present an ad-hoc object representation, and within limits can themselves be limited, transformed, ordered, threaded, and joined. -They can't be indexed into, and unless you explicitly say so are not ordered. - -TODO add diagram showing inner bag like the ThinkBig demo (and reference it) - -TODO: a JOIN is used for: direct foreign key join; matching records on a criterion, possibly sparsely; set intersection. - -The core operation you will use to put records from one table into context with data from another table is the JOIN. A common application of the JOIN is to reunite data that has been normalized -- that is to say, where the database tables are organized to eliminate any redundancy. For example, each Retrosheet game log lists the ballpark in which it was played but, of course, it does not repeat the full information about that park within every record. Later in the book, (TODO: REF) we will want to label each game with its geo-coordinates so we can augment each with official weather data measurements. - -To join the game_logs table with the parks table, extracting the game time and park geocoordinates, run the following Pig command: - ------- -gls_with_parks_j = JOIN - parks BY (park_id), - game_logs BY (park_id); -explain gls_with_parks_j; -gls_with_parks = FOREACH gls_with_parks_j GENERATE - (game_id, gamelogs.park_id, game_time, park_lng, statium_lat); -explain gls_with_parks; -(TODO output of explain command) ------- - -The output schema of the new `gls_with_parks` table has all the fields from the `parks` table first (because it's first in the join statement), stapled to all the fields from the `game_logs` table. We only want some of the fields, so immediately following the JOIN is a FOREACH to extract what we're interested in. Note there are now two 'park_id' columns, one from each dataset, so in the subsequent FOREACH, we need to dereference the column name with the table from which it came. (TODO: check that Pig does push the projection of fields up above the JOIN). 
If you run the script, 'examples/geo/baseball_weather/geolocate_games.pig' you will see that its output has example as many records as there are 'game_logs' because there is exactly one entry in the 'parks' table for each park. - -In the general case, though, a JOIN can be many to many. Suppose we wanted to build a table listing all the home ballparks for every player over their career. The 'player_seasons' table has a row for each year and team over their career. If a player changed teams mid year, there will be two rows for that player. The 'park_years' table, meanwhile, has rows by season for every team and year it was used as a home stadium. Some ballparks have served as home for multiple teams within a season and in other cases (construction or special circumstances), teams have had multiple home ballparks within a season. - -The Pig script (TODO: write script) includes the following JOIN: - ------- -JOIN -player_park_years=JOIN - parks(year,team_ID), - players(year,team_ID); -explain_player_park_year; ------- - -First notice that the JOIN expression has multiple columns in this case separated by commas; you can actually enter complex expressions here -- almost all (but not all) the things you do within a FOREACH. If you examine the output file (TODO: name of output file), you will notice it has appreciably more lines than the input 'player' file. For example (TODO: find an example of a player with multiple teams having multiple parks), in year x player x played for the x and the y and y played in stadiums p and q. The one line in the 'players' table has turned into three lines in the 'players_parks_years' table. - -The examples we have given so far are joining on hard IDs within closely-related datasets, so every row was guaranteed to have a match. It is frequently the case, however, you will join tables having records in one or both tables that will fail to find a match. The 'parks_info' datasets from Retrosheet only lists the city name of each ballpark, not its location. In this case we found a separate human-curated list of ballpark geolocations, but geolocating records -- that is, using a human-readable location name such as "Austin, Texas" to find its nominal geocoordinates (-97.7,30.2) -- is a common task; it is also far more difficult than it has any right to be, but a useful first step is match the location names directly against a gazette of populated place names such as the open source Geonames dataset. - -Run the script (TODO: name of script) that includes the following JOIN: - ------- -park_places = JOIN - parks BY (location) LEFT OUTER, - places BY (concatenate(city, ", ", state); -DESCRIBE park_places; ------- - -In this example, there will be some parks that have no direct match to location names and, of course, there will be many, many places that do not match a park. The first two JOINs we did were "inner" JOINs -- the output contains only rows that found a match. In this case, we want to keep all the parks, even if no places matched but we do not want to keep any places that lack a park. Since all rows from the left (first most dataset) will be retained, this is called a "left outer" JOIN. If, instead, we were trying to annotate all places with such parks as could be matched -- producing exactly one output row per place -- we would use a "right outer" JOIN instead. If we wanted to do the latter but (somewhat inefficiently) flag parks that failed to find a match, you would use a "full outer" JOIN. (Full JOINs are pretty rare.) 
- -TODO: discuss use of left join for set intersection. - -In a Pig JOIN it is important to order the tables by size -- putting the smallest table first and the largest table last. (You'll learn why in the "Map/Reduce Patterns" (TODO: REF) chapter.) So while a right join is not terribly common in traditional SQL, it's quite valuable in Pig. If you look back at the previous examples, you will see we took care to always put the smaller table first. For small tables or tables of similar size, it is not a big deal -- but in some cases, it can have a huge impact, so get in the habit of always following this best practice. - ------- -NOTE -A Pig join is outwardly similar to the join portion of a SQL SELECT statement, but notice that although you can place simple expressions in the join expression, you can make no further manipulations to the data whatsoever in that statement. Pig's design philosophy is that each statement corresponds to a specific data transformation, making it very easy to reason about how the script will run; this makes the typical Pig script more long-winded than corresponding SQL statements but clearer for both human and robot to understand. ------- - -==== Reassemble a Vertically Partitioned Table - -Another reason to split data across tables is 'vertical partitioning': storing fields that are very large or seldom used in context within different tables. That's the case with the Wikipedia article tables -- the geolocation information is only relevant for geodata analysis; the article text is both large and not always relevant. - - - -Every stadium a player has played in. (We're going to cheat on the detail of -multiple stints and credit every player with all stadiums visited by the team -of his first stint in a season - ------- - -- there are only a few many-to-many cases, so the 89583 seasons in batting - -- table expands to only 91904 player-park-years. But it's a cross product, so - -- beware. -SELECT COUNT(*) FROM batting bat WHERE bat.stint = 1; -SELECT bat.player_id, bat.team_id, bat.year_id, pty.park_id - FROM batting bat - INNER JOIN park_team_years pty - ON bat.year_id = pty.year_id AND bat.team_id = pty.team_id - WHERE bat.stint = 1 - ORDER BY player_id - ; ------- - -What if you only want the distinct player-team-years? -You might naively do a join and then a group by, -or a join and then distinct. Don't do that. - ------- - -- DON'T DO THE (pig equivalent) OF THIS to find the distinct teams, years and parks; - -- it's an extra reduce. -SELECT bat.player_id, bat.nameCommon, - GROUP_CONCAT(DISTINCT pty.park_id) AS park_ids, COUNT(DISTINCT pty.park_id) AS n_parks, - GROUP_CONCAT(DISTINCT bat.team_id) AS team_ids, - MIN(bat.year_id) AS begYear, MAX(bat.year_id) AS endYear - FROM bat_war bat - INNER JOIN park_team_years pty - ON bat.year_id = pty.year_id AND bat.team_id = pty.team_id - WHERE bat.stint = 1 AND player_id IS NOT NULL - GROUP BY player_id - HAVING begYear > 1900 - ORDER BY n_parks DESC, player_id ASC - ; - - Join bat_yr on (team_id, year_id), pty by (team_id, year_id); - FOREACH @ GENERATE bat_years::player_id, park_id; - Group by player_id - Distinct parks - - Cogroup baty by (team_id, year_id), pty by (team_id, year_id); - distinct park_id, ------- - -So now we disclose the most important thing that SQL experts need to break -their brains of: - -In SQL, the JOIN is supreme. -In Pig, the GROUP is supreme - -A JOIN is, for the most part, just sugar around a COGROUP-and-FLATTEN. -Very often you'll find the simplest path is through COGROUP not JOIN. 
- -In this case, if you start by thinking of the group, you'll see you can eliminate a whole reduce. - -(show pig, including a DISTINCT in the fancy-style FOREACH) - -==== Join Practicalities - -(add note) Joins on null values are dropped even when both are null. Filter nulls. (I can't come up with a good example of this) -(add note) in contrast, all elements with null in a group _will_ be grouped as null. This can be dangerous when large number of nulls: all go to same reducer - - -=== SQL-to-Pig-to-Hive Cheatsheet - -* SELECT..WHERE -* SELECT...LIMit -* GROUP BY...HAVING -* SELECT WHERE... ORDER BY -* SELECT WHERE... SORT BY (just use reducer sort) ~~ (does reducer in Pig guarantee this?) -* SELECT … DISTRIBUTE BY … SORT BY ... -* SELECT ... CLUSTER BY (equiv of distribute by X sort by X) -* Indexing tips -* CASE...when...then -* Block Sampling / Input pruning -* SELECT country_name, indicator_name, `2011` AS trade_2011 FROM wdi WHERE (indicator_name = 'Trade (% of GDP)' OR indicator_name = 'Broad money (% of GDP)') AND `2011` IS NOT NULL CLUSTER BY indicator_name; - -SELECT columns or computations FROM table WHERE condition GROUP BY columns HAVING condition ORDER BY column [ASC | DESC] LIMIT offset,count; - -==== Ready Reckoner: How fast should your Pig fly? --> not sure what this is - -TODO: move to the first tuning chapter. - -The idea is to have you run through a set of pig scripts with datasets of defined size, measuring the throughput of the core operations. The result is a ready reckoner that lets you estimate how long your job _should_ take (and how many map-reduce stages it will use). diff --git a/06-analytic_patterns-structural_operations-grouping.asciidoc b/06-grouping_patterns.asciidoc similarity index 98% rename from 06-analytic_patterns-structural_operations-grouping.asciidoc rename to 06-grouping_patterns.asciidoc index 8231ed6..d6f76ee 100644 --- a/06-analytic_patterns-structural_operations-grouping.asciidoc +++ b/06-grouping_patterns.asciidoc @@ -406,6 +406,48 @@ See the Pattern in Use for the next section too (REF). * _Records_ -- Something like `mykey, aggregated_value, aggregated_value, ...` * _Data Flow_ -- Map, Combiner & Reduce; combiners quite effective unless cardinality is very high. + + + +// === How a group works +// +// ------ +// team_n_parks = FOREACH (GROUP park_teams BY (team_id,year_id)) GENERATE +// group.team_id, COUNT_STAR(park_teams) AS n_parks; +// vagabonds = FILTER team_n_parks BY n_parks >= 3; +// +// DUMP vagabonds; +// (CL4,7) +// (CLE,5) +// (WS3,4) +// (CLE,3) +// (DET,3) +// ... +// ------ +// +// ------ +// mapper(array_fields_of: ParkTeamYear) do |park_id, team_id, year_id, beg_date, end_date, n_games| +// yield [team_id, year_id] +// end +// +// # In effect, what is happening in Java: +// reducer do |(team_id, year_id), stream| +// n_parks = 0 +// stream.each do |*_| +// n_parks += 1 +// end +// yield [team_id, year_id, n_parks] if n_parks > 1 +// end +// ------ +// +// (In actual practice, the ruby version would just call `n_parks = stream.size` rather than iterating) +// +// +// TODO in part on groups note As Jon Covent says, "Bags are what makes Pig Awesome". SQL doesn't have them, and they bring extraordinary power. They can be of arbitrarily large size, present an ad-hoc object representation, and within limits can themselves be limited, transformed, ordered, threaded, and joined. +// They can't be indexed into, and unless you explicitly say so are not ordered. 
+// +// TODO add diagram showing inner bag like the ThinkBig demo (and reference it) + ==== Completely Summarizing a Field diff --git a/06-analytic_patterns-structural_operations-joining.asciidoc b/07-joining_patterns.asciidoc similarity index 81% rename from 06-analytic_patterns-structural_operations-joining.asciidoc rename to 07-joining_patterns.asciidoc index 8e404f5..12ba7c2 100644 --- a/06-analytic_patterns-structural_operations-joining.asciidoc +++ b/07-joining_patterns.asciidoc @@ -213,6 +213,135 @@ stats_and_fatness = FOREACH (JOIN fatness BY player_id, stats BY player_id) // The output of the Join job has one line for each discrete combination of A and B. As you will notice in our Wukong version of the Join, the job receives all the A records for a given key in order, strictly followed by all the B records for that key in order. We have to accumulate all the A records in memory so we know what rows to emit for each B record. All the A records have to be held in memory at the same time, while all the B records simply flutter by; this means that if you have two datasets of wildly different sizes or distribution, it is worth ensuring the Reducer receives the smaller group first. In Wukong, you do this by giving it an earlier-occurring field group label; in Pig, always put the table with the largest number of records per key last in the statement. + +// +// TODO: a JOIN is used for: direct foreign key join; matching records on a criterion, possibly sparsely; set intersection. +// +// The core operation you will use to put records from one table into context with data from another table is the JOIN. A common application of the JOIN is to reunite data that has been normalized -- that is to say, where the database tables are organized to eliminate any redundancy. For example, each Retrosheet game log lists the ballpark in which it was played but, of course, it does not repeat the full information about that park within every record. Later in the book, (TODO: REF) we will want to label each game with its geo-coordinates so we can augment each with official weather data measurements. +// +// To join the game_logs table with the parks table, extracting the game time and park geocoordinates, run the following Pig command: +// +// ------ +// gls_with_parks_j = JOIN +// parks BY (park_id), +// game_logs BY (park_id); +// explain gls_with_parks_j; +// gls_with_parks = FOREACH gls_with_parks_j GENERATE +// (game_id, gamelogs.park_id, game_time, park_lng, statium_lat); +// explain gls_with_parks; +// (TODO output of explain command) +// ------ +// +// The output schema of the new `gls_with_parks` table has all the fields from the `parks` table first (because it's first in the join statement), stapled to all the fields from the `game_logs` table. We only want some of the fields, so immediately following the JOIN is a FOREACH to extract what we're interested in. Note there are now two 'park_id' columns, one from each dataset, so in the subsequent FOREACH, we need to dereference the column name with the table from which it came. (TODO: check that Pig does push the projection of fields up above the JOIN). If you run the script, 'examples/geo/baseball_weather/geolocate_games.pig' you will see that its output has example as many records as there are 'game_logs' because there is exactly one entry in the 'parks' table for each park. +// +// In the general case, though, a JOIN can be many to many. Suppose we wanted to build a table listing all the home ballparks for every player over their career. 
The 'player_seasons' table has a row for each year and team over their career. If a player changed teams mid year, there will be two rows for that player. The 'park_years' table, meanwhile, has rows by season for every team and year it was used as a home stadium. Some ballparks have served as home for multiple teams within a season and in other cases (construction or special circumstances), teams have had multiple home ballparks within a season. +// +// The Pig script (TODO: write script) includes the following JOIN: +// +// ------ +// JOIN +// player_park_years=JOIN +// parks(year,team_ID), +// players(year,team_ID); +// explain_player_park_year; +// ------ +// +// First notice that the JOIN expression has multiple columns in this case separated by commas; you can actually enter complex expressions here -- almost all (but not all) the things you do within a FOREACH. If you examine the output file (TODO: name of output file), you will notice it has appreciably more lines than the input 'player' file. For example (TODO: find an example of a player with multiple teams having multiple parks), in year x player x played for the x and the y and y played in stadiums p and q. The one line in the 'players' table has turned into three lines in the 'players_parks_years' table. +// +// The examples we have given so far are joining on hard IDs within closely-related datasets, so every row was guaranteed to have a match. It is frequently the case, however, you will join tables having records in one or both tables that will fail to find a match. The 'parks_info' datasets from Retrosheet only lists the city name of each ballpark, not its location. In this case we found a separate human-curated list of ballpark geolocations, but geolocating records -- that is, using a human-readable location name such as "Austin, Texas" to find its nominal geocoordinates (-97.7,30.2) -- is a common task; it is also far more difficult than it has any right to be, but a useful first step is match the location names directly against a gazette of populated place names such as the open source Geonames dataset. +// +// Run the script (TODO: name of script) that includes the following JOIN: +// +// ------ +// park_places = JOIN +// parks BY (location) LEFT OUTER, +// places BY (concatenate(city, ", ", state); +// DESCRIBE park_places; +// ------ +// +// In this example, there will be some parks that have no direct match to location names and, of course, there will be many, many places that do not match a park. The first two JOINs we did were "inner" JOINs -- the output contains only rows that found a match. In this case, we want to keep all the parks, even if no places matched but we do not want to keep any places that lack a park. Since all rows from the left (first most dataset) will be retained, this is called a "left outer" JOIN. If, instead, we were trying to annotate all places with such parks as could be matched -- producing exactly one output row per place -- we would use a "right outer" JOIN instead. If we wanted to do the latter but (somewhat inefficiently) flag parks that failed to find a match, you would use a "full outer" JOIN. (Full JOINs are pretty rare.) +// +// TODO: discuss use of left join for set intersection. +// +// In a Pig JOIN it is important to order the tables by size -- putting the smallest table first and the largest table last. (You'll learn why in the "Map/Reduce Patterns" (TODO: REF) chapter.) So while a right join is not terribly common in traditional SQL, it's quite valuable in Pig. 
If you look back at the previous examples, you will see we took care to always put the smaller table first. For small tables or tables of similar size, it is not a big deal -- but in some cases, it can have a huge impact, so get in the habit of always following this best practice. +// +// ------ +// NOTE +// A Pig join is outwardly similar to the join portion of a SQL SELECT statement, but notice that although you can place simple expressions in the join expression, you can make no further manipulations to the data whatsoever in that statement. Pig's design philosophy is that each statement corresponds to a specific data transformation, making it very easy to reason about how the script will run; this makes the typical Pig script more long-winded than corresponding SQL statements but clearer for both human and robot to understand. +// ------ +// +// ==== Reassemble a Vertically Partitioned Table +// +// Another reason to split data across tables is 'vertical partitioning': storing fields that are very large or seldom used in context within different tables. That's the case with the Wikipedia article tables -- the geolocation information is only relevant for geodata analysis; the article text is both large and not always relevant. +// +// +// +// Every stadium a player has played in. (We're going to cheat on the detail of +// multiple stints and credit every player with all stadiums visited by the team +// of his first stint in a season +// +// ------ +// -- there are only a few many-to-many cases, so the 89583 seasons in batting +// -- table expands to only 91904 player-park-years. But it's a cross product, so +// -- beware. +// SELECT COUNT(*) FROM batting bat WHERE bat.stint = 1; +// SELECT bat.player_id, bat.team_id, bat.year_id, pty.park_id +// FROM batting bat +// INNER JOIN park_team_years pty +// ON bat.year_id = pty.year_id AND bat.team_id = pty.team_id +// WHERE bat.stint = 1 +// ORDER BY player_id +// ; +// ------ +// +// What if you only want the distinct player-team-years? +// You might naively do a join and then a group by, +// or a join and then distinct. Don't do that. +// +// ------ +// -- DON'T DO THE (pig equivalent) OF THIS to find the distinct teams, years and parks; +// -- it's an extra reduce. +// SELECT bat.player_id, bat.nameCommon, +// GROUP_CONCAT(DISTINCT pty.park_id) AS park_ids, COUNT(DISTINCT pty.park_id) AS n_parks, +// GROUP_CONCAT(DISTINCT bat.team_id) AS team_ids, +// MIN(bat.year_id) AS begYear, MAX(bat.year_id) AS endYear +// FROM bat_war bat +// INNER JOIN park_team_years pty +// ON bat.year_id = pty.year_id AND bat.team_id = pty.team_id +// WHERE bat.stint = 1 AND player_id IS NOT NULL +// GROUP BY player_id +// HAVING begYear > 1900 +// ORDER BY n_parks DESC, player_id ASC +// ; +// +// Join bat_yr on (team_id, year_id), pty by (team_id, year_id); +// FOREACH @ GENERATE bat_years::player_id, park_id; +// Group by player_id +// Distinct parks +// +// Cogroup baty by (team_id, year_id), pty by (team_id, year_id); +// distinct park_id, +// ------ +// +// So now we disclose the most important thing that SQL experts need to break +// their brains of: +// +// In SQL, the JOIN is supreme. +// In Pig, the GROUP is supreme +// +// A JOIN is, for the most part, just sugar around a COGROUP-and-FLATTEN. +// Very often you'll find the simplest path is through COGROUP not JOIN. +// +// In this case, if you start by thinking of the group, you'll see you can eliminate a whole reduce. 
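// The parked note just below asks to "show pig, including a DISTINCT in the fancy-style FOREACH".
// As a hedged placeholder until that example is written properly, here is a sketch of the
// nested-FOREACH idiom; `player_park_years` is assumed to already hold (player_id, park_id)
// pairs, for instance from the COGROUP of batting seasons with park_team_years described above.
//
// ------
// -- Nested ("fancy-style") FOREACH with a DISTINCT inside the block: one reduce
// -- gives each player's distinct home parks and their count.
// player_parks = FOREACH (GROUP player_park_years BY player_id) {
//   parks = DISTINCT player_park_years.park_id;
//   GENERATE group AS player_id, COUNT_STAR(parks) AS n_parks, BagToString(parks, ';') AS park_ids;
// };
// ------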
+// +// (show pig, including a DISTINCT in the fancy-style FOREACH) +// +// ==== Join Practicalities +// +// (add note) Joins on null values are dropped even when both are null. Filter nulls. (I can't come up with a good example of this) +// (add note) in contrast, all elements with null in a group _will_ be grouped as null. This can be dangerous when large number of nulls: all go to same reducer + + ===== Pattern in Use * _Exercise_ -- Explore the correspondence of weight, height and BMI to SLG using a medium-data tool such as R, Pandas or Excel. Spoiler alert: the stereotypes of the big fat slugger is quire true. diff --git a/06-analytic_patterns-structural_operations-ordering.asciidoc b/08-ordering_patterns.asciidoc similarity index 100% rename from 06-analytic_patterns-structural_operations-ordering.asciidoc rename to 08-ordering_patterns.asciidoc diff --git a/09-statistics.asciidoc b/09-statistics.asciidoc deleted file mode 100644 index 03ee247..0000000 --- a/09-statistics.asciidoc +++ /dev/null @@ -1,370 +0,0 @@ -[[statistics]] -== Statistics - -=== Skeleton: Statistics - -Data is worthless. Actually, it's worse than worthless. It costs you money to gather, store, manage, replicate and analyze. What you really want is insight -- a relevant summary of the essential patterns in that data -- produced using relationships to analyze data in context. - -Statistical summaries are the purest form of this activity, and will be used repeatedly in the book to come, so now that you see how Hadoop is used it's a good place to focus. - -Some statistical measures let you summarize the whole from summaries of the parts: I can count all the votes in the state by summing the votes from each county, and the votes in each county by summing the votes at each polling station. Those types of aggregations -- average/standard deviation, correlation, and so forth -- are naturally scalable, but just having billions of objects introduces some practical problems you need to avoid. We'll also use them to introduce Pig, a high-level language for SQL-like queries on large datasets. - -Other statistical summaries require assembling context that grows with the size of the whole dataset. The amount of intermediate data required to count distinct objects, extract an accurate histogram, or find the median and other quantiles can become costly and cumbersome. That's especially unfortunate because so much data at large scale has a long-tail, not normal (Gaussian) distribution -- the median is far more robust indicator of the "typical" value than the average. (If Bill Gates walks into a bar, everyone in there is a billionaire on average.) - -==== These go somewhere - -.Pig Gotchas -**** - -**"dot or colon?"** - -Some late night under deadline, Pig will supply you with the absolutely baffling error message "scalar has more than one row in the output". You've gotten confused and used the tuple element operation (`players.year`) when you should have used the disambiguation operator (`players::year`). The dot is used to reference a tuple element, a common task following a `GROUP`. The double-colon is used to clarify which specific field is intended, common following a join of tables sharing a field name. - -Where to look to see that Pig is telling you have either nulls, bad fields, numbers larger than your type will hold or a misaligned schema. - -Things that used to be gotchas, but aren't, and are preserved here just through the tech review: - -* You can rename an alias, and refer to the new name: `B = A;` works. 
(v10) -* LIMIT is handled in the loader, and LIMIT accepts an expression (v10) -* There is an OTHERWISE (else) statement on SPLIT! v10 -* If you kill a Pig job using Ctrl-C or “kill”, Pig will now kill all associated Hadoop jobs currently running. This is applicable to both grunt mode and non-interactive mode. -* In next Pig (post-0.12.0), - - CONCAT will accept multiple args - - store can overwrite existing directory (PIG-259) - -**"Good Habits of SQL Users That Will Mess You Up in Hadoop"** - -* Group/Cogroup is king; Join is a special case -* Window functions are a recent feature -- use but don't overuse Stitch/Over. -* Everything is immutable, so you don't need and can't have transactional behavior - -TODO: fill this in with more gotchas -**** - -. A Foolish Optimization -**** -TODO: Make this be more generally "don't use the O(N) algorithm that works locally" -- fisher-yates and top-k-via-heap being two examples -TODO: consider pushing this up, earlier in the chapter, if we find a good spot for it - -We will tell you about another "optimization," mostly because we want to illustrate how a naive performance estimation based on theory can lead you astray in practice. In principle, sorting a large table in place takes 'O(N log N)' time. In a single compute node context, you can actually find the top K elements in 'O(N log K)' time -- a big savings since K is much smaller than N. What you do is maintain a heap structure; for every element past the Kth, if it is larger than the smallest element in the heap, remove the smallest member of the heap and add the element to the heap. While it is true that 'O(N log K)' beats 'O(N log N)', this reasoning is flawed in two ways. First, you are not working in a single-node context; Hadoop is going to perform that sort anyway. Second, the fixed costs of I/O almost always dominate the cost of compute (FOOTNOTE: Unless you are unjustifiably fiddling with a heap in your Mapper.) - -The 'O(log N)' portion of Hadoop's log sort shows up in two ways: The N memory sort that precedes a spill is 'O(N log N)' in compute time but less expensive than the cost of spilling the data. The true 'O(N log N)' cost comes in the reducer: 'O(log N)' merge passes, each of cost 'O(N)'. footnote:[If initial spills have M records, each merge pass combines B spills into one file, and we can skip the last merge pass, the total time is `N (log_B(N/M)-1).` [TODO: double check this]. But K is small, so there should not be multiple merge passes; the actual runtime is 'O(N)' in disk bandwidth. Avoid subtle before-the-facts reasoning about performance; run your job, count the number of merge passes, weigh your salary against the costs of the computers you are running on, and only then decide if it is worth optimizing. -**** - -=== Summary Statistics - - - -* Calculating Summary Statistics on Groups with Aggregate Functions - - COUNT_STAR(), Count Distinct, count of nulls, MIN(), MAX(), SUM(), AVG() and STDEV() - - there are a core set of aggregate functions that we use to summarize the - - Use COUNT_STAR() to count Records in a Group; MIN() and MAX() to find the single largest / smallest values in a group; SUM() to find the total of all values in a group. The built-in AVG() function returns the arithmetic mean. To find the standard deviation, use the (double check the name) function from Datafu. - - describe difference between count and count_star. Note that the number of null values is (count_star - count). 
Recommend to always use COUNT_STAR unless you are explicitly conveying that you want to exclude nulls. Make sure we follow that advice. - - demonstrate this for summarizing players' weight and height by year. Show a stock-market style candlestick graph of weight and of height (min, avg-STDEV, avg, avg+STDEV, max), with graph of "volume" (count, count distinct and count_star) below it. Players are getting bigger and stronger; more of them as league and roster size grows; more data (fewer nulls) after early days. - - the median is hard and so we will wait until stats chapter. - - other summary stats (kurtosis, other higher-moments), no built-in function - - nested FOREACH (in the previous chapter we found obp, slg, ops from counting stats; now do it but for career. - - Aggregating Nullable Columns (NULL values don't get counted in an average. To have them be counted, ternary NULL values into a zero) - - -Criteria for being in the structural patterns part: in remainder of book,(a) there's only one way to do it; (b) we don't talk about how it's done. - -Later or here or at all demonstrate combiners in m-r? - - -TODO: content to come - - -Join pl-yr on pk-te-yr: pl-pk-te-yr -Group ppty on pl: pl-g-pks-tes-yrs -Agg pgty on pk and yr: pl-g-tes-stadia -Flatten pgty on pk and yr: pl-te-stadia -Teammates: pl-yr to tm-yr-g-pls; cross to tm-yr-g-plaplbs; project to plas-plbs-gs - flatten to pla-plbs group to pla-g-plbs - distinct to pla-d-plbs (or pl[teammates]) - flatten to pla-plb (or teammates) - - - -Weather stations: wstn-info ws-dt-hr[obs] -Wp: art[text] art[info] art-dt-h[views] -Server Logs: ip-url-time[reqs] -UFOs: sightings[plc-dt-tm] -airports: ap-tm[alid,flid,dest,flights] - - --- Group on year; find COUNT(), count distinct, MIN(), MAX(), SUM(), AVG(), STDEV(), byte size - -SELECT - MIN(HR) AS hr_min, - MAX(HR) AS hr_max, - AVG(HR) AS hr_avg, - STDDEV_POP(HR) AS hr_stddev, - SUM(HR) AS hr_sum, - COUNT(*) AS n_recs, - COUNT(*) - COUNT(HR) AS hr_n_nulls, - COUNT(DISTINCT HR) AS hr_n_distinct -- doesn't count NULL - FROM bat_season bat -; - -SELECT - MIN(nameFirst) AS nameFirst_min, - MAX(nameFirst) AS nameFirst_max, - -- - MIN(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_min, - MAX(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_max, - MIN(OCTET_LENGTH(nameFirst)) AS nameFirst_bytesize_max, - MAX(OCTET_LENGTH(nameFirst)) AS nameFirst_bytesize_max, - AVG(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_avg, - STDDEV_POP(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_stddev, - LEFT(GROUP_CONCAT(nameFirst),25) AS nameFirst_examples, - SUM(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_sum, - -- - COUNT(*) AS n_recs, - COUNT(*) - COUNT(nameFirst) AS nameFirst_n_nulls, - COUNT(DISTINCT nameFirst) AS nameFirst_n_distinct - FROM bat_career bat -; - -SELECT - player_id, - MIN(year_id) AS yearBeg, - MAX(year_id) AS yearEnd, - COUNT(*) AS n_years, - MIN(HR) AS hr_min, - MAX(HR) AS hr_max, - AVG(HR) AS hr_avg, - STDDEV_POP(HR) AS hr_stddev, - SUM(HR) AS hr_sum, - COUNT(*) AS n_recs, - COUNT(*) - COUNT(HR) AS hr_n_nulls, - COUNT(DISTINCT HR) AS hr_n_distinct -- doesn't count NULL - FROM bat_season bat - GROUP BY player_id - ORDER BY hr_max DESC -; -==== Transpose Columns Into `field name, field value` Pairs - -Our next pattern is to transpose fields from each row into records having a column with the field name and a column with the field value, sometimes called attribute-value form. 
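A hedged sketch of one way to do that in Pig: build a bag of `(name, value)` tuples from the chosen columns, then FLATTEN it so each input row yields one output row per field. The columns shown (`HR`, `H`, `BB`) are stand-ins for whichever stats you actually transpose.

----
-- Transpose chosen columns of bat_season into (stat_name, stat_value) pairs.
stat_pairs = FOREACH bat_season GENERATE player_id, year_id,
    FLATTEN(TOBAG(TOTUPLE('HR', HR), TOTUPLE('H', H), TOTUPLE('BB', BB)))
    AS (stat_name, stat_value);
----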
- - - - -=== Sampling - - -* Random sampling using the traditional pseudo-random number generators (which can be dangerous; we'll tell you how to do it right) (use input filename as seed) -* Consistent sampling returns a fraction of records by _key_: if a record with the key "chimpanzee" is selected into the sample, all records with that key are selected into the sample. -* (with/without replacement; weighted) -* Reservoir sampling selects a given number of records. A uniform reservoir sample with count 100, say, would return 100 records, each with the same chance of being selected, regardless of the size of the dataset. -* Subuniverse sampling selects a set of records and all associated records with it -- useful when you want to be able to joins on the sampled data, or to select a dense subgraph of a network. (TECH: is "dense subgraph" right?) -* Stratified sampling: sampling from groups/bins/strata/whatever - http://en.wikipedia.org/wiki/Stratified_sampling -* Sampling into multiple groups eg for bootstrapping -* Note that pig sample is mathematically lame (see Datafu for why) -* Note that pig sample is nice about eliminating records while loading (find out if Datafu does too) -* Warning I may have written lies about reservoir sampling make sure to review -* Spatial Sampling -* Also: generating distributions (use the random.org data set and generate a column for each dist using it) -* Expand the random.org by taking each r.o number as seed - - - -* http://blog.codinghorror.com/shuffling/ -* http://opencoursesfree.org/archived_courses/cs.berkeley.edu/~mhoemmen/cs194/Tutorials/prng.pdf - * "numbers with statistical properties of randomness. Note that I didn’t write “random numbers,” but rather, “numbers with statistical properties of randomness.”" -* Make sure you have enough bits -* Even 52 cards has 52! =~ 255 bits of permutation... can't possibly get every permutation for a table of even modest size -* Make sure you look out for ties and shuffle them as well -* Do you have to be think-y about the partitioner? -* Download about (8 years *365 days * 1 mebibyte) of randoms from random.org. This is however only 90 million 256-bit (32-byte) numbers, or 350 million 64-bit (8-byte) numbers. -* Don't just (rand mod 25) for a 1-in-25 random sample -- you'll be biased because it's not an exact number of bits. Instead reject if > 25 and try again. -* Watch out for non-reentrant rand() -- mutex or something (do we need to worry about this in hadoop?) -* http://blog.cloudera.com/blog/2013/02/how-to-resample-from-a-large-data-set-in-parallel-with-r-on-hadoop/ - * Sampling-with-replacement is the most popular method for sampling from the initial data set to produce a collection of samples for model fitting. This method is equivalent to sampling from a multinomial distribution where the probability of selecting any individual input data point is uniform over the entire data set. Unfortunately, it is not possible to sample from a multinomial distribution across a cluster without using some kind of communication between the nodes (i.e., sampling from a multinomial is not embarrassingly parallel). But do not despair: we can approximate a multinomial distribution by sampling from an identical Poisson distribution on each input data point independently, lending itself to an embarrassingly parallel implementation. - -Here's a clip from the PokerStars website (they did their homework): - -* A deck of 52 cards can be shuffled in 52! ways. 52! 
is about 2^225 (to be precise, 80,658,175,170,943,878,571,660,636,856,404,000,000,000,000,000 ways). We use 249 random bits from both entropy sources (user input and thermal noise) to achieve an even and unpredictable statistical distribution. -* Furthermore, we apply conservative rules to enforce the required degree of randomness; for instance, if user input does not generate required amount of entropy, we do not start the next hand until we obtain the required amount of entropy from Intel RNG. -* We use the SHA-1 cryptographic hash algorithm to mix the entropy gathered from both sources to provide an extra level of security -* We also maintain a SHA-1-based pseudo-random generator to provide even more security and protection from user data attacks -* To convert random bit stream to random numbers within a required range without bias, we use a simple and reliable algorithm. For example, if we need a random number in the range 0-25: - o we take 5 random bits and convert them to a random number 0-31 - o if this number is greater than 25 we just discard all 5 bits and repeat the process -* This method is not affected by biases related to modulus operation for generation of random numbers that are not 2n, n = 1,2,.. -* To perform an actual shuffle, we use another simple and reliable algorithm: - o first we draw a random card from the original deck (1 of 52) and place it in a new deck - now original deck contains 51 cards and the new deck contains 1 card - o then we draw another random card from the original deck (1 of 51) and place it on top of the new deck - now original deck contains 50 cards and the new deck contains 2 cards - o we repeat the process until all cards have moved from the original deck to the new deck -* This algorithm does not suffer from "Bad Distribution Of Shuffles" described in [2] - -[2] "How We Learned to Cheat at Online Poker: A Study in Software Security" - http://itmanagement.earthweb.com/entdev/article.php/616221 -[3] "The Intel Random Number Generator" - http://www.cryptography.com/resources/whitepapers/IntelRNG.pdf" - - -==== Sample Records Consistently - - ----- --- Consistent sample of events -SELECT ev.event_id, - LEFT(MD5(CONCAT(ev.game_id, ev.event_id)), 4) AS evid_hash, - ev.* - FROM events ev WHERE LEFT(MD5(CONCAT(ev.game_id, ev.event_id)), 2) = '00'; ----- - ----- --- Consistent sample of games -- all events from the game are retained --- FLO200310030 has gid_hash 0000... 
but evid_hash 0097 and so passes both -SELECT ev.event_id, - LEFT(MD5(ev.game_id),4) AS gid_hash, - ev.* - FROM events ev WHERE LEFT(MD5(ev.game_id),2) = '00'; ----- - -Out of 1962193 events in the 2010, 7665 expected (1/256th of the total); -got 8159 by game, 7695 by event - ----- -SELECT n_events, n_events/256, n_by_game, n_by_event - FROM - (SELECT COUNT(*) AS n_events FROM events) ev, - (SELECT COUNT(*) AS n_by_event FROM events WHERE LEFT(MD5(CONCAT(game_id,event_id)),2) = '00') ev_e, - (SELECT COUNT(*) AS n_by_game FROM events WHERE LEFT(MD5(game_id),2) = '00') ev_g - ; ----- - - -=== Generating Data - - - --- === Generating an Integers table - -DROP TABLE IF EXISTS numbers1k; -CREATE TABLE `numbers1k` ( - `idx` INT(20) UNSIGNED PRIMARY KEY AUTO_INCREMENT, - `ix0` INT(20) UNSIGNED NOT NULL DEFAULT '0', - `ixN` INT(20) UNSIGNED DEFAULT '0', - `ixS` INT(20) SIGNED NOT NULL DEFAULT '0', - `zip` INT(1) UNSIGNED NOT NULL DEFAULT '0', - `uno` INT(1) UNSIGNED NOT NULL DEFAULT '1' -) ENGINE=INNODB DEFAULT CHARSET=utf8; - -INSERT INTO numbers1k (ix0, ixN, ixS, zip, uno) -SELECT - (@row := @row + 1) - 1 AS ix0, - IF(@row=1, NULL, @row-2) AS ixN, - (@row - 500) AS ixS, - 0 AS zip, 1 AS uno - FROM -(select 0 union all select 1 union all select 3 union all select 4 union all select 5 union all select 6 union all select 6 union all select 7 union all select 8 union all select 9) t, -(select 0 union all select 1 union all select 3 union all select 4 union all select 5 union all select 6 union all select 6 union all select 7 union all select 8 union all select 9) t2, -(select 0 union all select 1 union all select 3 union all select 4 union all select 5 union all select 6 union all select 6 union all select 7 union all select 8 union all select 9) t3, -(SELECT @row:=0) r -; - -DROP TABLE IF EXISTS numbers; -CREATE TABLE `numbers` ( - `idx` INT(20) UNSIGNED PRIMARY KEY AUTO_INCREMENT, - `ix0` INT(20) UNSIGNED NOT NULL DEFAULT '0', - `ixN` INT(20) UNSIGNED DEFAULT '0', - `ixS` INT(20) SIGNED NOT NULL DEFAULT '0', - `zip` INT(1) UNSIGNED NOT NULL DEFAULT '0', - `uno` INT(1) UNSIGNED NOT NULL DEFAULT '1' -) ENGINE=INNODB DEFAULT CHARSET=utf8; - -INSERT INTO numbers (ix0, ixN, ixS, zip, uno) -SELECT - (@row := @row + 1) - 1 AS ix0, - IF(@row=1, NULL, @row-2) AS ixN, - (@row - 500000) AS ixS, - 0 AS zip, 1 AS uno -FROM -(SELECT zip FROM numbers1k) t1, -(SELECT zip FROM numbers1k) t2, -(SELECT @row:=0) r -; - - ----- - # generate 100 files of 100,000 integers each; takes about 15 seconds to run - time ruby -e '10_000_000.times.map{|num| puts num }' | gsplit -l 100000 -a 2 --additional-suffix .tsv -d - numbers - - # in mapper, read N and generate `(0 .. 99).map{|offset| 100 * N + offset }` ----- - - -==== Season leaders - --- * Selecting top-k Records within Group --- GROUP...FOREACH GENERATE TOP --- most hr season-by-season - -==== Transpose record into attribute-value pairs - -Group by season, transpose, and take the top 10 for each season, attribute pair - -=== Overflow, Underflow and other Dangers - -TODO: content to come - -=== Quantiles and Histograms - -TODO: content to come - - -In the structural operations chapter, we brought up the subject of calculating quantiles (an equal-width histogram), but postponed the discussion, judging it to be fiendishly hard. Calculating even an exact median -- the simplest case -- in a single map-reduce flow is not just hard, it's provably impossible (REF cormode paper). 
- -The issue is that you need to get all candidates for the edge of a bin onto the same reducer, and know the number of elements that precede the candidates on your reducer. From the mapper, however, it's impossible to know what keys to assign without knowing the global distribution -- the very thing we want to calculate! /end move to statistics) - -==== Median - ----- -SELECT COUNT(*), CEIL(COUNT(*)/2) AS midrow - FROM bat_career - ; -SELECT G, cols.* - FROM bat_career bat, - (SELECT COUNT(*) AS n_entries, CEIL(COUNT(*)/2) AS midrow FROM bat_career) cols - ORDER BY HR - LIMIT 1 OFFSET 8954 -; ----- - -==== Exact median using RANK - -Well, we've met another operation with this problem, namely the sort (ORDER BY) operation. It does a first pass to sample the global distribution of keys, then a full map-reduce to place ordered values on the same reducer. Its numerate younger brother, RANK, will do what we need. The quartiles -- the boundaries of the four bins bins each holding 25% of the values -- ... - -(Show using RANK and then filter; use the "pre-inject and assert global values" trick for the bin size. Handle the detail of needing to average two values when boundary splits an index, eg median of a table with even number of rows) - -==== Approximate median & quantiles using DataFu - (get better title) - - -=== Algebraic vs Holistic Aggregations - -TODO: content to come - -=== "Sketching" Algorithms - -TODO: content to come - -* For each entry, calculate N hashes -* increment each bucket -* count cannot be more than value of smallest bucket - -Even if bucket for `zephyr` collides with the bucket for `also` on one of the hashes, it's exceedingly unlikely to collide with it on all the - -Related to Bloom Filter - - -=== Exercises - -Distributions: - -* First letter of Wikipedia article titles - -* Count of inbound links for wikipedia articles - -* Total sum of pageviews counts for each page diff --git a/06-analytic_patterns-structural_operations-uniquing.asciidoc b/09-uniquing_patterns.asciidoc similarity index 100% rename from 06-analytic_patterns-structural_operations-uniquing.asciidoc rename to 09-uniquing_patterns.asciidoc diff --git a/10-advanced_patterns.asciidoc b/10-advanced_patterns.asciidoc index 398db23..9eb4be4 100644 --- a/10-advanced_patterns.asciidoc +++ b/10-advanced_patterns.asciidoc @@ -1178,3 +1178,349 @@ pairs_r = FOREACH (GROUP raw BY client_ip) { ------ +[[statistics]] +== Statistics + +=== Skeleton: Statistics + +Data is worthless. Actually, it's worse than worthless. It costs you money to gather, store, manage, replicate and analyze. What you really want is insight -- a relevant summary of the essential patterns in that data -- produced using relationships to analyze data in context. + +Statistical summaries are the purest form of this activity, and will be used repeatedly in the book to come, so now that you see how Hadoop is used it's a good place to focus. + +Some statistical measures let you summarize the whole from summaries of the parts: I can count all the votes in the state by summing the votes from each county, and the votes in each county by summing the votes at each polling station. Those types of aggregations -- average/standard deviation, correlation, and so forth -- are naturally scalable, but just having billions of objects introduces some practical problems you need to avoid. We'll also use them to introduce Pig, a high-level language for SQL-like queries on large datasets. 
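+
+As a quick illustration of how those partial summaries roll up, here is a minimal Pig sketch of the vote-counting example (the `precinct_votes` relation and its field names are hypothetical, just for illustration):
+
+----
+-- each record: (state, county, precinct, n_votes)
+precinct_votes = LOAD 'precinct_votes' AS (state:chararray, county:chararray, precinct:chararray, n_votes:long);
+
+-- county totals are sums over precinct counts...
+county_votes = FOREACH (GROUP precinct_votes BY (state, county)) GENERATE
+    FLATTEN(group) AS (state, county), SUM(precinct_votes.n_votes) AS n_votes;
+
+-- ...and state totals are sums over county totals; no single step ever needs the whole dataset in one place
+state_votes = FOREACH (GROUP county_votes BY state) GENERATE
+    group AS state, SUM(county_votes.n_votes) AS n_votes;
+----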
+
+Other statistical summaries require assembling context that grows with the size of the whole dataset. The amount of intermediate data required to count distinct objects, extract an accurate histogram, or find the median and other quantiles can become costly and cumbersome. That's especially unfortunate because so much data at large scale has a long-tail, not normal (Gaussian) distribution -- the median is a far more robust indicator of the "typical" value than the average. (If Bill Gates walks into a bar, everyone in there is a billionaire on average.)
+
+==== These go somewhere
+
+.Pig Gotchas
+****
+
+**"dot or colon?"**
+
+Some late night under deadline, Pig will supply you with the absolutely baffling error message "scalar has more than one row in the output". You've gotten confused and used the tuple element operation (`players.year`) when you should have used the disambiguation operator (`players::year`). The dot is used to reference a tuple element, a common task following a `GROUP`. The double-colon is used to clarify which specific field is intended, common following a join of tables sharing a field name.
+
+Where to look to see whether Pig is telling you that you have nulls, bad fields, numbers larger than your type will hold, or a misaligned schema.
+
+Things that used to be gotchas, but aren't, and are preserved here just through the tech review:
+
+* You can rename an alias, and refer to the new name: `B = A;` works. (v10)
+* LIMIT is handled in the loader, and LIMIT accepts an expression (v10)
+* There is an OTHERWISE (else) statement on SPLIT! v10
+* If you kill a Pig job using Ctrl-C or “kill”, Pig will now kill all associated Hadoop jobs currently running. This is applicable to both grunt mode and non-interactive mode.
+* In next Pig (post-0.12.0),
+  - CONCAT will accept multiple args
+  - store can overwrite existing directory (PIG-259)
+
+**"Good Habits of SQL Users That Will Mess You Up in Hadoop"**
+
+* Group/Cogroup is king; Join is a special case
+* Window functions are a recent feature -- use but don't overuse Stitch/Over.
+* Everything is immutable, so you don't need and can't have transactional behavior
+
+TODO: fill this in with more gotchas
+****
+
+.A Foolish Optimization
+****
+TODO: Make this be more generally "don't use the O(N) algorithm that works locally" -- fisher-yates and top-k-via-heap being two examples
+TODO: consider pushing this up, earlier in the chapter, if we find a good spot for it
+
+We will tell you about another "optimization," mostly because we want to illustrate how a naive performance estimation based on theory can lead you astray in practice. In principle, sorting a large table in place takes 'O(N log N)' time. In a single-node context, you can actually find the top K elements in 'O(N log K)' time -- a big savings, since K is much smaller than N. What you do is maintain a heap structure; for every element past the Kth, if it is larger than the smallest element in the heap, remove the smallest member of the heap and add the element to the heap. While it is true that 'O(N log K)' beats 'O(N log N)', this reasoning is flawed in two ways. First, you are not working in a single-node context; Hadoop is going to perform that sort anyway. Second, the fixed costs of I/O almost always dominate the cost of compute (FOOTNOTE: Unless you are unjustifiably fiddling with a heap in your Mapper.).
+
+The 'O(log N)' portion of Hadoop's log sort shows up in two ways: the in-memory sort that precedes a spill is 'O(N log N)' in compute time, but less expensive than the cost of spilling the data. The true 'O(N log N)' cost comes in the reducer: 'O(log N)' merge passes, each of cost 'O(N)'. footnote:[If initial spills have M records, each merge pass combines B spills into one file, and we can skip the last merge pass, then the total time is `N (log_B(N/M) - 1)` (TODO: double check this).] But K is small, so there should not be multiple merge passes; the actual runtime is 'O(N)' in disk bandwidth. Avoid subtle before-the-facts reasoning about performance; run your job, count the number of merge passes, weigh your salary against the costs of the computers you are running on, and only then decide if it is worth optimizing.
+****
+
+=== Summary Statistics
+
+
+
+* Calculating Summary Statistics on Groups with Aggregate Functions
+  - COUNT_STAR(), Count Distinct, count of nulls, MIN(), MAX(), SUM(), AVG() and STDEV()
+  - there is a core set of aggregate functions that we use to summarize the values in a group
+  - Use COUNT_STAR() to count records in a group; MIN() and MAX() to find the single largest / smallest values in a group; SUM() to find the total of all values in a group. The built-in AVG() function returns the arithmetic mean. To find the standard deviation, use the (double check the name) function from Datafu.
+  - describe difference between count and count_star. Note that the number of null values is (count_star - count). Recommend always using COUNT_STAR unless you are explicitly conveying that you want to exclude nulls. Make sure we follow that advice. (A sketch follows these notes.)
+  - demonstrate this for summarizing players' weight and height by year. Show a stock-market style candlestick graph of weight and of height (min, avg-STDEV, avg, avg+STDEV, max), with a graph of "volume" (count, count distinct and count_star) below it. Players are getting bigger and stronger; more of them as league and roster size grows; more data (fewer nulls) after the early days.
+  - the median is hard, and so we will wait until the quantiles section below.
+  - other summary stats (kurtosis, other higher moments): no built-in function
+  - nested FOREACH (in the previous chapter we found obp, slg, ops from counting stats; now do it, but for the career.)
+  - Aggregating Nullable Columns (NULL values don't get counted in an average. To have them be counted, use a ternary expression to turn NULL values into a zero.)
+
+
+Criteria for being in the structural patterns part: in the remainder of the book, (a) there's only one way to do it; (b) we don't talk about how it's done.
+
+Later, here, or at all: demonstrate combiners in map/reduce?
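+
+Here is a minimal Pig sketch of the COUNT_STAR-versus-COUNT point in the notes above (the `bat_seasons` file name and field layout are assumed for illustration; adjust to the real dataset):
+
+----
+-- one record per player-season; HR may be null for early seasons
+bat_seasons = LOAD 'bat_seasons' AS (player_id:chararray, year_id:int, HR:int);
+
+hr_summary = FOREACH (GROUP bat_seasons BY year_id) GENERATE
+    group                                           AS year_id,
+    COUNT_STAR(bat_seasons)                         AS n_seasons,     -- counts every record, nulls included
+    COUNT(bat_seasons.HR)                           AS n_seasons_hr,  -- skips records whose HR is null
+    COUNT_STAR(bat_seasons) - COUNT(bat_seasons.HR) AS n_hr_nulls,
+    MIN(bat_seasons.HR)                             AS hr_min,
+    MAX(bat_seasons.HR)                             AS hr_max,
+    SUM(bat_seasons.HR)                             AS hr_sum,
+    AVG(bat_seasons.HR)                             AS hr_avg;
+----
+
+Standard deviation and the other higher moments need a UDF (Datafu, as noted above), so they are left out of this sketch.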
+
+TODO: content to come
+
+Join pl-yr on pk-te-yr: pl-pk-te-yr
+Group ppty on pl: pl-g-pks-tes-yrs
+Agg pgty on pk and yr: pl-g-tes-stadia
+Flatten pgty on pk and yr: pl-te-stadia
+Teammates: pl-yr to tm-yr-g-pls; cross to tm-yr-g-plaplbs; project to plas-plbs-gs
+  flatten to pla-plbs group to pla-g-plbs
+  distinct to pla-d-plbs (or pl[teammates])
+  flatten to pla-plb (or teammates)
+
+
+
+Weather stations: wstn-info ws-dt-hr[obs]
+Wp: art[text] art[info] art-dt-h[views]
+Server Logs: ip-url-time[reqs]
+UFOs: sightings[plc-dt-tm]
+airports: ap-tm[alid,flid,dest,flights]
+
+
+-- Group on year; find COUNT(), count distinct, MIN(), MAX(), SUM(), AVG(), STDEV(), byte size
+
+SELECT
+  MIN(HR) AS hr_min,
+  MAX(HR) AS hr_max,
+  AVG(HR) AS hr_avg,
+  STDDEV_POP(HR) AS hr_stddev,
+  SUM(HR) AS hr_sum,
+  COUNT(*) AS n_recs,
+  COUNT(*) - COUNT(HR) AS hr_n_nulls,
+  COUNT(DISTINCT HR) AS hr_n_distinct -- doesn't count NULL
+  FROM bat_season bat
+;
+
+SELECT
+  MIN(nameFirst) AS nameFirst_min,
+  MAX(nameFirst) AS nameFirst_max,
+  --
+  MIN(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_min,
+  MAX(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_max,
+  MIN(OCTET_LENGTH(nameFirst)) AS nameFirst_bytesize_min,
+  MAX(OCTET_LENGTH(nameFirst)) AS nameFirst_bytesize_max,
+  AVG(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_avg,
+  STDDEV_POP(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_stddev,
+  LEFT(GROUP_CONCAT(nameFirst),25) AS nameFirst_examples,
+  SUM(CHAR_LENGTH(nameFirst)) AS nameFirst_strlen_sum,
+  --
+  COUNT(*) AS n_recs,
+  COUNT(*) - COUNT(nameFirst) AS nameFirst_n_nulls,
+  COUNT(DISTINCT nameFirst) AS nameFirst_n_distinct
+  FROM bat_career bat
+;
+
+SELECT
+  player_id,
+  MIN(year_id) AS yearBeg,
+  MAX(year_id) AS yearEnd,
+  COUNT(*) AS n_years,
+  MIN(HR) AS hr_min,
+  MAX(HR) AS hr_max,
+  AVG(HR) AS hr_avg,
+  STDDEV_POP(HR) AS hr_stddev,
+  SUM(HR) AS hr_sum,
+  COUNT(*) AS n_recs,
+  COUNT(*) - COUNT(HR) AS hr_n_nulls,
+  COUNT(DISTINCT HR) AS hr_n_distinct -- doesn't count NULL
+  FROM bat_season bat
+  GROUP BY player_id
+  ORDER BY hr_max DESC
+;
+
+==== Transpose Columns Into `field name, field value` Pairs
+
+Our next pattern is to transpose fields from each row into records having a column with the field name and a column with the field value, sometimes called attribute-value form.
+
+=== Sampling
+
+
+* Random sampling using the traditional pseudo-random number generators (which can be dangerous; we'll tell you how to do it right) (use input filename as seed)
+* Consistent sampling returns a fraction of records by _key_: if a record with the key "chimpanzee" is selected into the sample, all records with that key are selected into the sample.
+* (with/without replacement; weighted)
+* Reservoir sampling selects a given number of records. A uniform reservoir sample with count 100, say, would return 100 records, each with the same chance of being selected, regardless of the size of the dataset.
+* Subuniverse sampling selects a set of records and all associated records with it -- useful when you want to be able to do joins on the sampled data, or to select a dense subgraph of a network. (TECH: is "dense subgraph" right?)
+* Stratified sampling: sampling from groups/bins/strata/whatever - http://en.wikipedia.org/wiki/Stratified_sampling
+* Sampling into multiple groups, e.g., for bootstrapping
+* Note that pig sample is mathematically lame (see Datafu for why)
+* Note that pig sample is nice about eliminating records while loading (find out if Datafu does too)
+* Warning: I may have written lies about reservoir sampling -- make sure to review
+* Spatial Sampling
+* Also: generating distributions (use the random.org data set and generate a column for each dist using it)
+* Expand the random.org set by taking each r.o number as a seed
+
+
+
+* http://blog.codinghorror.com/shuffling/
+* http://opencoursesfree.org/archived_courses/cs.berkeley.edu/~mhoemmen/cs194/Tutorials/prng.pdf
+  * "numbers with statistical properties of randomness. Note that I didn’t write “random numbers,” but rather, “numbers with statistical properties of randomness.”"
+* Make sure you have enough bits
+* Even a deck of 52 cards has 52! =~ 226 bits of permutation... can't possibly get every permutation for a table of even modest size
+* Make sure you look out for ties and shuffle them as well
+* Do you have to be think-y about the partitioner?
+* Download about (8 years * 365 days * 1 mebibyte) of randoms from random.org. This is, however, only 90 million 256-bit (32-byte) numbers, or 350 million 64-bit (8-byte) numbers.
+* Don't just (rand mod 25) for a 1-in-25 random sample -- you'll be biased because it's not an exact number of bits. Instead reject if >= 25 and try again.
+* Watch out for non-reentrant rand() -- mutex or something (do we need to worry about this in hadoop?)
+* http://blog.cloudera.com/blog/2013/02/how-to-resample-from-a-large-data-set-in-parallel-with-r-on-hadoop/
+  * Sampling-with-replacement is the most popular method for sampling from the initial data set to produce a collection of samples for model fitting. This method is equivalent to sampling from a multinomial distribution where the probability of selecting any individual input data point is uniform over the entire data set. Unfortunately, it is not possible to sample from a multinomial distribution across a cluster without using some kind of communication between the nodes (i.e., sampling from a multinomial is not embarrassingly parallel). But do not despair: we can approximate a multinomial distribution by sampling from an identical Poisson distribution on each input data point independently, lending itself to an embarrassingly parallel implementation.
+
+Here's a clip from the PokerStars website (they did their homework):
+
+* A deck of 52 cards can be shuffled in 52! ways. 52! is about 2^225 (to be precise, 80,658,175,170,943,878,571,660,636,856,404,000,000,000,000,000 ways). We use 249 random bits from both entropy sources (user input and thermal noise) to achieve an even and unpredictable statistical distribution.
+* Furthermore, we apply conservative rules to enforce the required degree of randomness; for instance, if user input does not generate required amount of entropy, we do not start the next hand until we obtain the required amount of entropy from Intel RNG.
+* We use the SHA-1 cryptographic hash algorithm to mix the entropy gathered from both sources to provide an extra level of security
+* We also maintain a SHA-1-based pseudo-random generator to provide even more security and protection from user data attacks
+* To convert random bit stream to random numbers within a required range without bias, we use a simple and reliable algorithm.
For example, if we need a random number in the range 0-25: + o we take 5 random bits and convert them to a random number 0-31 + o if this number is greater than 25 we just discard all 5 bits and repeat the process +* This method is not affected by biases related to modulus operation for generation of random numbers that are not 2n, n = 1,2,.. +* To perform an actual shuffle, we use another simple and reliable algorithm: + o first we draw a random card from the original deck (1 of 52) and place it in a new deck - now original deck contains 51 cards and the new deck contains 1 card + o then we draw another random card from the original deck (1 of 51) and place it on top of the new deck - now original deck contains 50 cards and the new deck contains 2 cards + o we repeat the process until all cards have moved from the original deck to the new deck +* This algorithm does not suffer from "Bad Distribution Of Shuffles" described in [2] + +[2] "How We Learned to Cheat at Online Poker: A Study in Software Security" - http://itmanagement.earthweb.com/entdev/article.php/616221 +[3] "The Intel Random Number Generator" - http://www.cryptography.com/resources/whitepapers/IntelRNG.pdf" + + +==== Sample Records Consistently + + +---- +-- Consistent sample of events +SELECT ev.event_id, + LEFT(MD5(CONCAT(ev.game_id, ev.event_id)), 4) AS evid_hash, + ev.* + FROM events ev WHERE LEFT(MD5(CONCAT(ev.game_id, ev.event_id)), 2) = '00'; +---- + +---- +-- Consistent sample of games -- all events from the game are retained +-- FLO200310030 has gid_hash 0000... but evid_hash 0097 and so passes both +SELECT ev.event_id, + LEFT(MD5(ev.game_id),4) AS gid_hash, + ev.* + FROM events ev WHERE LEFT(MD5(ev.game_id),2) = '00'; +---- + +Out of 1962193 events in the 2010, 7665 expected (1/256th of the total); +got 8159 by game, 7695 by event + +---- +SELECT n_events, n_events/256, n_by_game, n_by_event + FROM + (SELECT COUNT(*) AS n_events FROM events) ev, + (SELECT COUNT(*) AS n_by_event FROM events WHERE LEFT(MD5(CONCAT(game_id,event_id)),2) = '00') ev_e, + (SELECT COUNT(*) AS n_by_game FROM events WHERE LEFT(MD5(game_id),2) = '00') ev_g + ; +---- + +=== Generating Data + +-- === Generating an Integers table + +DROP TABLE IF EXISTS numbers1k; +CREATE TABLE `numbers1k` ( + `idx` INT(20) UNSIGNED PRIMARY KEY AUTO_INCREMENT, + `ix0` INT(20) UNSIGNED NOT NULL DEFAULT '0', + `ixN` INT(20) UNSIGNED DEFAULT '0', + `ixS` INT(20) SIGNED NOT NULL DEFAULT '0', + `zip` INT(1) UNSIGNED NOT NULL DEFAULT '0', + `uno` INT(1) UNSIGNED NOT NULL DEFAULT '1' +) ENGINE=INNODB DEFAULT CHARSET=utf8; + +INSERT INTO numbers1k (ix0, ixN, ixS, zip, uno) +SELECT + (@row := @row + 1) - 1 AS ix0, + IF(@row=1, NULL, @row-2) AS ixN, + (@row - 500) AS ixS, + 0 AS zip, 1 AS uno + FROM +(select 0 union all select 1 union all select 3 union all select 4 union all select 5 union all select 6 union all select 6 union all select 7 union all select 8 union all select 9) t, +(select 0 union all select 1 union all select 3 union all select 4 union all select 5 union all select 6 union all select 6 union all select 7 union all select 8 union all select 9) t2, +(select 0 union all select 1 union all select 3 union all select 4 union all select 5 union all select 6 union all select 6 union all select 7 union all select 8 union all select 9) t3, +(SELECT @row:=0) r +; + +DROP TABLE IF EXISTS numbers; +CREATE TABLE `numbers` ( + `idx` INT(20) UNSIGNED PRIMARY KEY AUTO_INCREMENT, + `ix0` INT(20) UNSIGNED NOT NULL DEFAULT '0', + `ixN` INT(20) UNSIGNED DEFAULT '0', + `ixS` 
INT(20) SIGNED NOT NULL DEFAULT '0', + `zip` INT(1) UNSIGNED NOT NULL DEFAULT '0', + `uno` INT(1) UNSIGNED NOT NULL DEFAULT '1' +) ENGINE=INNODB DEFAULT CHARSET=utf8; + +INSERT INTO numbers (ix0, ixN, ixS, zip, uno) +SELECT + (@row := @row + 1) - 1 AS ix0, + IF(@row=1, NULL, @row-2) AS ixN, + (@row - 500000) AS ixS, + 0 AS zip, 1 AS uno +FROM +(SELECT zip FROM numbers1k) t1, +(SELECT zip FROM numbers1k) t2, +(SELECT @row:=0) r +; + + +---- + # generate 100 files of 100,000 integers each; takes about 15 seconds to run + time ruby -e '10_000_000.times.map{|num| puts num }' | gsplit -l 100000 -a 2 --additional-suffix .tsv -d - numbers + + # in mapper, read N and generate `(0 .. 99).map{|offset| 100 * N + offset }` +---- + + +==== Season leaders + +-- * Selecting top-k Records within Group +-- GROUP...FOREACH GENERATE TOP +-- most hr season-by-season + +==== Transpose record into attribute-value pairs + +Group by season, transpose, and take the top 10 for each season, attribute pair + +=== Overflow, Underflow and other Dangers + +TODO: content to come + +=== Quantiles and Histograms + +TODO: content to come + + +In the structural operations chapter, we brought up the subject of calculating quantiles (an equal-width histogram), but postponed the discussion, judging it to be fiendishly hard. Calculating even an exact median -- the simplest case -- in a single map-reduce flow is not just hard, it's provably impossible (REF cormode paper). + +The issue is that you need to get all candidates for the edge of a bin onto the same reducer, and know the number of elements that precede the candidates on your reducer. From the mapper, however, it's impossible to know what keys to assign without knowing the global distribution -- the very thing we want to calculate! /end move to statistics) + +==== Median + +---- +SELECT COUNT(*), CEIL(COUNT(*)/2) AS midrow + FROM bat_career + ; +SELECT G, cols.* + FROM bat_career bat, + (SELECT COUNT(*) AS n_entries, CEIL(COUNT(*)/2) AS midrow FROM bat_career) cols + ORDER BY HR + LIMIT 1 OFFSET 8954 +; +---- + +==== Exact median using RANK + +Well, we've met another operation with this problem, namely the sort (ORDER BY) operation. It does a first pass to sample the global distribution of keys, then a full map-reduce to place ordered values on the same reducer. Its numerate younger brother, RANK, will do what we need. The quartiles -- the boundaries of the four bins bins each holding 25% of the values -- ... + +(Show using RANK and then filter; use the "pre-inject and assert global values" trick for the bin size. Handle the detail of needing to average two values when boundary splits an index, eg median of a table with even number of rows) + +==== Approximate median & quantiles using DataFu + (get better title) + + + +=== Exercises + +Distributions: + +* First letter of Wikipedia article titles + +* Count of inbound links for wikipedia articles + +* Total sum of pageviews counts for each page diff --git a/10a-event_streams-more.asciidoc b/10a-event_streams-more.asciidoc deleted file mode 100644 index 39a4fe8..0000000 --- a/10a-event_streams-more.asciidoc +++ /dev/null @@ -1,140 +0,0 @@ -////This intro here is a good model for other chapters - this one's rough, but the bones are here. 
Amy//// - -Much of Hadoop's adoption is driven by organizations realizing they the opportunity to measure every aspect of their operation, unify those data sources, and act on the patterns that Hadoop and other Big Data tools uncover.////Share a few examples of things that can be measured (website hits, etc.) Amy//// - -For - -e-commerce site, an advertising broker, or a hosting provider, there's no wonder inherent in being able to measure every customer interaction, no controversy that it's enormously valuable to uncovering patterns in those interactions, and no lack of tools to act on those patterns in real time. - -can use the clickstream of interactions with each email -; this was one of the cardinal advantages cited in the success of Barack Obama's 2012 campaign. - - -This chapter's techniques will help, say, a hospital process the stream of data from every ICU patient; a retailer process the entire purchase-decision process -from -or a political campaign to understand and tune -the response to -each email batch and advertising placement. - - -Hadoop cut its teeth at Yahoo, where it was primarily used for processing internet-sized web crawls(see next chapter on text processing) and - -// ?? maybe this should just be 'data streams' or something - - -Quite likely, server log processing either a) is the reason you got this book or b) seems utterly boring.////I think you should explain this for readers. Amy//// For the latter folks, stick with it; hidden in this chapter are basic problems of statistics (finding histogram of pageviews), text processing (regular expressions for parsing), and graphs (constructing the tree of paths that users take through a website). - -=== Pageview Histograms === -////Ease the reader in with something like, "Our goal here will be..." Amy//// - -Let's start exploring the dataset. Andy Baio - ----- -include::code/serverlogs/old/logline-02-histograms-mapper.rb[] ----- - -We want to group on `date_hr`, so just add a 'virtual accessor' -- a method that behaves like an attribute but derives its value from another field: - ----- -include::code/serverlogs/old/logline-00-model-date_hr.rb[] ----- - -This is the advantage of having a model and not just a passive sack of data. - -Run it in map mode: - ----- -include::code/serverlogs/old/logline-02-histograms-02-mapper-wu-lign-sort.log[] ----- - -TODO: digression about `wu-lign`. - -Sort and save the map output; then write and debug your reducer. - ----- -include::code/serverlogs/old/logline-02-histograms-full.rb[] ----- - -When things are working, this is what you'll see. Notice that the `.../Star_Wars_Kid.wmv` file already have five times the pageviews as the site root (`/`). - ----- -include::code/serverlogs/old/logline-02-histograms-03-reduce.log[] ----- - -You're ready to run the script in the cloud! Fire it off and you'll see dozens of workers start processing the data. - ----- -include::code/serverlogs/old/logline-02-histograms-04-freals.log[] ----- - - -=== User Paths through the site ("Sessionizing") - -We can use the user logs to assemble a picture of how users navigate the site -- 'sessionizing' their pageviews. Marketing and e-commerce sites have a great deal of interest in optimizing their "conversion funnel", the sequence of pages that visitors follow before filling out a contact form, or buying those shoes, or whatever it is the site exists to serve. Visitor sessions are also useful for defining groups of related pages, in a manner far more robust than what simple page-to-page links would define. 
A recommendation algorithm using those relations would for example help an e-commerce site recommend teflon paste to a visitor buying plumbing fittings, or help a news site recommend an article about Marilyn Monroe to a visitor who has just finished reading an article about John F Kennedy. Many commercial web analytics tools don't offer a view into user sessions -- assembling them is extremely challenging for a traditional datastore. It's a piece of cake for Hadoop, as you're about to see. - -////This spot could be an effective place to say more about "Locality" and taking the reader deeper into thinking about that concept in context. Amy//// - -NOTE:[Take a moment and think about the locality: what feature(s) do we need to group on? What additional feature(s) should we sort with?] - - -spit out `[ip, date_hr, visit_time, path]`. - ----- -include::code/serverlogs/old/logline-03-breadcrumbs-full.rb[] ----- - -You might ask why we don't partition directly on say both `visitor_id` and date (or other time bucket). Partitioning by date would break the locality of any visitor session that crossed midnight: some of the requests would be in one day, the rest would be in the next day. - -run it in map mode: - ----- -include::code/serverlogs/old/logline-02-histograms-01-mapper.log[] ----- - ----- -include::code/serverlogs/old/logline-03-breadcrumbs-02-mapper.log[] ----- - -group on user - ----- -include::code/serverlogs/old/logline-03-breadcrumbs-03-reducer.log[] ----- - -We use the secondary sort so that each visit is in strict order of time within a session. - -You might ask why that is necessary -- surely each mapper reads the lines in order? Yes, but you have no control over what order the mappers run, or where their input begins and ends. - -This script will accumulate multiple visits of a page. - -TODO: say more about the secondary sort. -////This may sound wacky, but please try it out: use the JFK/MM exmaple again, here. Tie this all together more, the concepts, using those memorable people. I can explain this live, too. Amy//// - -==== Web-crawlers and the Skew Problem ==== - -In a - -It's important to use real data when you're testing algorithms: -a skew problem like this - -=== Page-Page similarity - -What can you do with the sessionized logs? Well, each row lists a visitor-session on the left and a bunch of pages on the right. We've been thinking about that as a table, but it's also a graph -- actually, a bunch of graphs! The <> describes an _affinity graph_, but we can build a simpler graph that just connects pages to pages by counting the number of times a pair of pages were visited by the same session. Every time a person requests the `/archive/2003/04/03/typo_pop.shtml` page _and_ the `/archive/2003/04/29/star_war.shtml` page in the same visit, that's one point towards their similarity. The chapter on <> has lots of fun things to do with a graph like this, so for now we'll just lay the groundwork by computing the page-page similarity graph defined by visitor sessions. - ----- -include::code/serverlogs/old/logline-04-page_page_edges-full.rb[] ----- - ----- -include::code/serverlogs/old/logline-04-page_page_edges-03-reducer.log[] ----- - -[[serverlogs_affinity_graph]] -.Affinity Graph -**** -First, you can think of it as an _affinity graph_ pairing visitor sessions with pages. The Netflix prize motivated a lot of work to help us understand affinity graphs -- in that case, a pairing of Netflix users with movies. 
Affinity graph-based recommendation engines simultaneously group similar users with similar movies (or similar sessions with similar pages). Imagine the following device. Set up two long straight lengths of wire, with beads that can slide along it. Beads on the left represent visitor sessions, ones on the right represent pages. These are magic beads, in that they can slide through each other, and they can clamp themselves to the wire. They are also slightly magnetic, so with no other tension they would not clump together but instead arrange themselves at some separated interval along the wire. Clamp all the beads in place for a moment and tie a small elastic string between each session bead and each page in that session. (These elastic bands also magically don't interfere with each other). To combat the crawler-robot effect, choose tighter strings when there are few pages in the session, and weaker strings when there are lots of pages in the session. Once you've finished stringing this up, unclamp one of the session beads. It will snap to a position opposit the middle of all the pages it is tied to. If you now unclamp each of those page beads, they'll move to sit opposite that first session bead. As you continue to unclamp all the beads, you'll find that they organize into clumps along the wire: when a bunch of sessions link to a common set of pages, their mutal forces combine to drag them opposite each other. That's the intuitive view; there are proper mathematical treatments, of course, for kind of co-clustering. - ----- -TODO: figure showing bipartite session-page graph ----- -**** diff --git a/07-part_three.asciidoc b/11-3-part_three-applications.asciidoc similarity index 100% rename from 07-part_three.asciidoc rename to 11-3-part_three-applications.asciidoc diff --git a/10-event_streams.asciidoc b/11-event_streams.asciidoc similarity index 57% rename from 10-event_streams.asciidoc rename to 11-event_streams.asciidoc index 119d69b..fec6e5a 100644 --- a/10-event_streams.asciidoc +++ b/11-event_streams.asciidoc @@ -2,6 +2,33 @@ == Event Streams == +////This intro here is a good model for other chapters - this one's rough, but the bones are here. Amy//// + +Much of Hadoop's adoption is driven by organizations realizing they the opportunity to measure every aspect of their operation, unify those data sources, and act on the patterns that Hadoop and other Big Data tools uncover.////Share a few examples of things that can be measured (website hits, etc.) Amy//// + +For + +e-commerce site, an advertising broker, or a hosting provider, there's no wonder inherent in being able to measure every customer interaction, no controversy that it's enormously valuable to uncovering patterns in those interactions, and no lack of tools to act on those patterns in real time. + +can use the clickstream of interactions with each email +; this was one of the cardinal advantages cited in the success of Barack Obama's 2012 campaign. + + +This chapter's techniques will help, say, a hospital process the stream of data from every ICU patient; a retailer process the entire purchase-decision process +from +or a political campaign to understand and tune +the response to +each email batch and advertising placement. + + +Hadoop cut its teeth at Yahoo, where it was primarily used for processing internet-sized web crawls(see next chapter on text processing) and + +// ?? 
maybe this should just be 'data streams' or something + + +Quite likely, server log processing either a) is the reason you got this book or b) seems utterly boring.////I think you should explain this for readers. Amy//// For the latter folks, stick with it; hidden in this chapter are basic problems of statistics (finding histogram of pageviews), text processing (regular expressions for parsing), and graphs (constructing the tree of paths that users take through a website). + + === Webserver Log Parsing === We'll represent loglines with the following <>: @@ -82,6 +109,118 @@ TODO ---- +=== Pageview Histograms === +////Ease the reader in with something like, "Our goal here will be..." Amy//// + +Let's start exploring the dataset. Andy Baio + +---- +include::code/serverlogs/old/logline-02-histograms-mapper.rb[] +---- + +We want to group on `date_hr`, so just add a 'virtual accessor' -- a method that behaves like an attribute but derives its value from another field: + +---- +include::code/serverlogs/old/logline-00-model-date_hr.rb[] +---- + +This is the advantage of having a model and not just a passive sack of data. + +Run it in map mode: + +---- +include::code/serverlogs/old/logline-02-histograms-02-mapper-wu-lign-sort.log[] +---- + +TODO: digression about `wu-lign`. + +Sort and save the map output; then write and debug your reducer. + +---- +include::code/serverlogs/old/logline-02-histograms-full.rb[] +---- + +When things are working, this is what you'll see. Notice that the `.../Star_Wars_Kid.wmv` file already have five times the pageviews as the site root (`/`). + +---- +include::code/serverlogs/old/logline-02-histograms-03-reduce.log[] +---- + +You're ready to run the script in the cloud! Fire it off and you'll see dozens of workers start processing the data. + +---- +include::code/serverlogs/old/logline-02-histograms-04-freals.log[] +---- + + +=== User Paths through the site ("Sessionizing") + +We can use the user logs to assemble a picture of how users navigate the site -- 'sessionizing' their pageviews. Marketing and e-commerce sites have a great deal of interest in optimizing their "conversion funnel", the sequence of pages that visitors follow before filling out a contact form, or buying those shoes, or whatever it is the site exists to serve. Visitor sessions are also useful for defining groups of related pages, in a manner far more robust than what simple page-to-page links would define. A recommendation algorithm using those relations would for example help an e-commerce site recommend teflon paste to a visitor buying plumbing fittings, or help a news site recommend an article about Marilyn Monroe to a visitor who has just finished reading an article about John F Kennedy. Many commercial web analytics tools don't offer a view into user sessions -- assembling them is extremely challenging for a traditional datastore. It's a piece of cake for Hadoop, as you're about to see. + +////This spot could be an effective place to say more about "Locality" and taking the reader deeper into thinking about that concept in context. Amy//// + +NOTE:[Take a moment and think about the locality: what feature(s) do we need to group on? What additional feature(s) should we sort with?] + + +spit out `[ip, date_hr, visit_time, path]`. + +---- +include::code/serverlogs/old/logline-03-breadcrumbs-full.rb[] +---- + +You might ask why we don't partition directly on say both `visitor_id` and date (or other time bucket). 
Partitioning by date would break the locality of any visitor session that crossed midnight: some of the requests would be in one day, the rest would be in the next day. + +run it in map mode: + +---- +include::code/serverlogs/old/logline-02-histograms-01-mapper.log[] +---- + +---- +include::code/serverlogs/old/logline-03-breadcrumbs-02-mapper.log[] +---- + +group on user + +---- +include::code/serverlogs/old/logline-03-breadcrumbs-03-reducer.log[] +---- + +We use the secondary sort so that each visit is in strict order of time within a session. + +You might ask why that is necessary -- surely each mapper reads the lines in order? Yes, but you have no control over what order the mappers run, or where their input begins and ends. + +This script will accumulate multiple visits of a page. + +TODO: say more about the secondary sort. +////This may sound wacky, but please try it out: use the JFK/MM exmaple again, here. Tie this all together more, the concepts, using those memorable people. I can explain this live, too. Amy//// + +==== Web-crawlers and the Skew Problem ==== + +In a + +It's important to use real data when you're testing algorithms: +a skew problem like this + +=== Page-Page similarity + +What can you do with the sessionized logs? Well, each row lists a visitor-session on the left and a bunch of pages on the right. We've been thinking about that as a table, but it's also a graph -- actually, a bunch of graphs! The <> describes an _affinity graph_, but we can build a simpler graph that just connects pages to pages by counting the number of times a pair of pages were visited by the same session. Every time a person requests the `/archive/2003/04/03/typo_pop.shtml` page _and_ the `/archive/2003/04/29/star_war.shtml` page in the same visit, that's one point towards their similarity. The chapter on <> has lots of fun things to do with a graph like this, so for now we'll just lay the groundwork by computing the page-page similarity graph defined by visitor sessions. + +---- +include::code/serverlogs/old/logline-04-page_page_edges-full.rb[] +---- + +---- +include::code/serverlogs/old/logline-04-page_page_edges-03-reducer.log[] +---- + +[[serverlogs_affinity_graph]] +.Affinity Graph +**** +First, you can think of it as an _affinity graph_ pairing visitor sessions with pages. The Netflix prize motivated a lot of work to help us understand affinity graphs -- in that case, a pairing of Netflix users with movies. Affinity graph-based recommendation engines simultaneously group similar users with similar movies (or similar sessions with similar pages). Imagine the following device. Set up two long straight lengths of wire, with beads that can slide along it. Beads on the left represent visitor sessions, ones on the right represent pages. These are magic beads, in that they can slide through each other, and they can clamp themselves to the wire. They are also slightly magnetic, so with no other tension they would not clump together but instead arrange themselves at some separated interval along the wire. Clamp all the beads in place for a moment and tie a small elastic string between each session bead and each page in that session. (These elastic bands also magically don't interfere with each other). To combat the crawler-robot effect, choose tighter strings when there are few pages in the session, and weaker strings when there are lots of pages in the session. Once you've finished stringing this up, unclamp one of the session beads. 
It will snap to a position opposit the middle of all the pages it is tied to. If you now unclamp each of those page beads, they'll move to sit opposite that first session bead. As you continue to unclamp all the beads, you'll find that they organize into clumps along the wire: when a bunch of sessions link to a common set of pages, their mutal forces combine to drag them opposite each other. That's the intuitive view; there are proper mathematical treatments, of course, for kind of co-clustering. + +TODO: figure showing bipartite session-page graph +**** === Geo-IP Matching === diff --git a/11-geographic.asciidoc b/12-geospatial_analysis.asciidoc similarity index 100% rename from 11-geographic.asciidoc rename to 12-geospatial_analysis.asciidoc diff --git a/12-placeholder.asciidoc b/12-placeholder.asciidoc deleted file mode 100644 index daa9912..0000000 --- a/12-placeholder.asciidoc +++ /dev/null @@ -1 +0,0 @@ -== Placeholder diff --git a/11a-geodata-intro.asciidoc b/12a-geospatial-intro.asciidoc similarity index 100% rename from 11a-geodata-intro.asciidoc rename to 12a-geospatial-intro.asciidoc diff --git a/11c-spatial_aggregations_on_regions.asciidoc b/12c-geospatial-aggregations_on_regions.asciidoc similarity index 100% rename from 11c-spatial_aggregations_on_regions.asciidoc rename to 12c-geospatial-aggregations_on_regions.asciidoc diff --git a/11c-geospatial_mechanics.asciidoc b/12c-geospatial-mechanics.asciidoc similarity index 100% rename from 11c-geospatial_mechanics.asciidoc rename to 12c-geospatial-mechanics.asciidoc diff --git a/11d-quadtiles.asciidoc b/12d-geospatial-quadtiles.asciidoc similarity index 100% rename from 11d-quadtiles.asciidoc rename to 12d-geospatial-quadtiles.asciidoc diff --git a/11e-weather_near_you.asciidoc b/12e-geospatial-weather_near_you.asciidoc similarity index 100% rename from 11e-weather_near_you.asciidoc rename to 12e-geospatial-weather_near_you.asciidoc diff --git a/11g-end_of_geographic.asciidoc b/12g-geospatial-end.asciidoc similarity index 100% rename from 11g-end_of_geographic.asciidoc rename to 12g-geospatial-end.asciidoc diff --git a/11z-spatial_manor-data.asciidoc b/12z-geospatial-spatial_manor-data.asciidoc similarity index 100% rename from 11z-spatial_manor-data.asciidoc rename to 12z-geospatial-spatial_manor-data.asciidoc diff --git a/12-text_analysis.asciidoc b/13-text_analysis.asciidoc similarity index 100% rename from 12-text_analysis.asciidoc rename to 13-text_analysis.asciidoc diff --git a/16-part_four.asciidoc b/16-part_four.asciidoc deleted file mode 100644 index 0c47207..0000000 --- a/16-part_four.asciidoc +++ /dev/null @@ -1 +0,0 @@ -= Internals: How Hadoop and Map/Reduce Work diff --git a/40-4-part_four-practicalities.asciidoc b/40-4-part_four-practicalities.asciidoc new file mode 100644 index 0000000..29bb553 --- /dev/null +++ b/40-4-part_four-practicalities.asciidoc @@ -0,0 +1 @@ += Practicalities diff --git a/07-big_data_ecosystem.asciidoc b/41-big_data_ecosystem.asciidoc similarity index 100% rename from 07-big_data_ecosystem.asciidoc rename to 41-big_data_ecosystem.asciidoc diff --git a/14-organizing_data.asciidoc b/42-organizing_data.asciidoc similarity index 100% rename from 14-organizing_data.asciidoc rename to 42-organizing_data.asciidoc diff --git a/11f-data_formats.asciidoc b/42a-data_formats.asciidoc similarity index 100% rename from 11f-data_formats.asciidoc rename to 42a-data_formats.asciidoc diff --git a/15-filesystem_mojo.asciidoc b/43-filesystem_mojo.asciidoc similarity index 100% rename from 
15-filesystem_mojo.asciidoc rename to 43-filesystem_mojo.asciidoc diff --git a/13-data_munging.asciidoc b/45-data_munging.asciidoc similarity index 100% rename from 13-data_munging.asciidoc rename to 45-data_munging.asciidoc diff --git a/13a-wikipedia_other.asciidoc b/45a-wikipedia_other.asciidoc similarity index 100% rename from 13a-wikipedia_other.asciidoc rename to 45a-wikipedia_other.asciidoc diff --git a/11b-spatial_aggregation-points.asciidoc b/45b-spatial_aggregation-points.asciidoc similarity index 100% rename from 11b-spatial_aggregation-points.asciidoc rename to 45b-spatial_aggregation-points.asciidoc diff --git a/13c-wikipedia_corpus.asciidoc b/45c-wikipedia_corpus.asciidoc similarity index 100% rename from 13c-wikipedia_corpus.asciidoc rename to 45c-wikipedia_corpus.asciidoc diff --git a/13d-munging-patterns.asciidoc b/45d-munging-patterns.asciidoc similarity index 100% rename from 13d-munging-patterns.asciidoc rename to 45d-munging-patterns.asciidoc diff --git a/13e-airline_flights.asciidoc b/45e-airline_flights.asciidoc similarity index 100% rename from 13e-airline_flights.asciidoc rename to 45e-airline_flights.asciidoc diff --git a/13f-daily_weather.asciidoc b/45f-daily_weather.asciidoc similarity index 100% rename from 13f-daily_weather.asciidoc rename to 45f-daily_weather.asciidoc diff --git a/13h-other_strategies.asciidoc b/45h-other_strategies.asciidoc similarity index 100% rename from 13h-other_strategies.asciidoc rename to 45h-other_strategies.asciidoc diff --git a/50-5-part_five-internals_and_tuning.asciidoc b/50-5-part_five-internals_and_tuning.asciidoc new file mode 100644 index 0000000..2a14179 --- /dev/null +++ b/50-5-part_five-internals_and_tuning.asciidoc @@ -0,0 +1 @@ += Internals and Tuning diff --git a/18-java_api.asciidoc b/51-java_api.asciidoc similarity index 100% rename from 18-java_api.asciidoc rename to 51-java_api.asciidoc diff --git a/19-advanced_pig.asciidoc b/52-advanced_pig.asciidoc similarity index 100% rename from 19-advanced_pig.asciidoc rename to 52-advanced_pig.asciidoc diff --git a/19a-advanced_pig.asciidoc b/52a-advanced_pig.asciidoc similarity index 100% rename from 19a-advanced_pig.asciidoc rename to 52a-advanced_pig.asciidoc diff --git a/22a-tuning-wise_and_lazy.asciidoc b/52a-tuning-wise_and_lazy.asciidoc similarity index 100% rename from 22a-tuning-wise_and_lazy.asciidoc rename to 52a-tuning-wise_and_lazy.asciidoc diff --git a/19b-pig_udfs.asciidoc b/52b-pig_udfs.asciidoc similarity index 100% rename from 19b-pig_udfs.asciidoc rename to 52b-pig_udfs.asciidoc diff --git a/12-hadoop_internals-just_enough_for_now.asciidoc b/53-hadoop_internals-just_enough_for_now.asciidoc similarity index 100% rename from 12-hadoop_internals-just_enough_for_now.asciidoc rename to 53-hadoop_internals-just_enough_for_now.asciidoc diff --git a/23-hadoop_tuning-brave_and_foolish.asciidoc b/53-hadoop_tuning-brave_and_foolish.asciidoc similarity index 100% rename from 23-hadoop_tuning-brave_and_foolish.asciidoc rename to 53-hadoop_tuning-brave_and_foolish.asciidoc diff --git a/21b-hadoop_internals-map_reduce.asciidoc b/53b-hadoop_internals-map_reduce.asciidoc similarity index 100% rename from 21b-hadoop_internals-map_reduce.asciidoc rename to 53b-hadoop_internals-map_reduce.asciidoc diff --git a/22b-tuning-pathology.asciidoc b/53b-tuning-pathology.asciidoc similarity index 100% rename from 22b-tuning-pathology.asciidoc rename to 53b-tuning-pathology.asciidoc diff --git a/22d-use_method_checklist.asciidoc b/53d-use_method_checklist.asciidoc similarity index 100% 
rename from 22d-use_method_checklist.asciidoc rename to 53d-use_method_checklist.asciidoc diff --git a/ha1b-hadoop_internals-logs.asciidoc b/54-hadoop_internals-logs.asciidoc similarity index 100% rename from ha1b-hadoop_internals-logs.asciidoc rename to 54-hadoop_internals-logs.asciidoc diff --git a/21-hadoop_internals.asciidoc b/54-hadoop_internals.asciidoc similarity index 100% rename from 21-hadoop_internals.asciidoc rename to 54-hadoop_internals.asciidoc diff --git a/22-hadoop_tuning.asciidoc b/54-hadoop_tuning.asciidoc similarity index 100% rename from 22-hadoop_tuning.asciidoc rename to 54-hadoop_tuning.asciidoc diff --git a/20-hbase_data_modeling.asciidoc b/55-hbase_data_modeling.asciidoc similarity index 100% rename from 20-hbase_data_modeling.asciidoc rename to 55-hbase_data_modeling.asciidoc diff --git a/20a-hbase_schema.asciidoc b/55a-hbase_schema.asciidoc similarity index 100% rename from 20a-hbase_schema.asciidoc rename to 55a-hbase_schema.asciidoc diff --git a/25-appendix.asciidoc b/80-appendix.asciidoc similarity index 100% rename from 25-appendix.asciidoc rename to 80-appendix.asciidoc diff --git a/27-exercises.asciidoc b/83-exercises.asciidoc similarity index 100% rename from 27-exercises.asciidoc rename to 83-exercises.asciidoc diff --git a/23a-overview_of_datasets.asciidoc b/83a-overview_of_datasets.asciidoc similarity index 100% rename from 23a-overview_of_datasets.asciidoc rename to 83a-overview_of_datasets.asciidoc diff --git a/25c-references.asciidoc b/83c-references.asciidoc similarity index 100% rename from 25c-references.asciidoc rename to 83c-references.asciidoc diff --git a/25d-overview_of_scripts.asciidoc b/84d-overview_of_scripts.asciidoc similarity index 100% rename from 25d-overview_of_scripts.asciidoc rename to 84d-overview_of_scripts.asciidoc diff --git a/25f-glossary.asciidoc b/85f-glossary.asciidoc similarity index 100% rename from 25f-glossary.asciidoc rename to 85f-glossary.asciidoc diff --git a/26-back_cover.asciidoc b/86-back_cover.asciidoc similarity index 100% rename from 26-back_cover.asciidoc rename to 86-back_cover.asciidoc diff --git a/E_and_C.md b/E_and_C.md new file mode 100644 index 0000000..f1e5275 --- /dev/null +++ b/E_and_C.md @@ -0,0 +1,12 @@ + +==== Olga and the Calculating Pigs + +* ... 
+* + +==== Elephant Courtship Rituals + + + +Each bull + diff --git a/02-feedback_and_response.asciidoc b/attic/02-feedback_and_response.asciidoc similarity index 100% rename from 02-feedback_and_response.asciidoc rename to attic/02-feedback_and_response.asciidoc diff --git a/08-intro_to_storm+trident.asciidoc b/attic/08-intro_to_storm+trident.asciidoc similarity index 100% rename from 08-intro_to_storm+trident.asciidoc rename to attic/08-intro_to_storm+trident.asciidoc diff --git a/09x-statistics-to_integrate.asciidoc b/attic/11-statistics.asciidoc similarity index 100% rename from 09x-statistics-to_integrate.asciidoc rename to attic/11-statistics.asciidoc diff --git a/16-conceptual_model.asciidoc b/attic/16-conceptual_model.asciidoc similarity index 100% rename from 16-conceptual_model.asciidoc rename to attic/16-conceptual_model.asciidoc diff --git a/17-machine_learning.asciidoc b/attic/17-machine_learning.asciidoc similarity index 100% rename from 17-machine_learning.asciidoc rename to attic/17-machine_learning.asciidoc diff --git a/24-storm+trident-internals.asciidoc b/attic/24-storm+trident-internals.asciidoc similarity index 100% rename from 24-storm+trident-internals.asciidoc rename to attic/24-storm+trident-internals.asciidoc diff --git a/24-storm+trident-topology.graffle b/attic/24-storm+trident-topology.graffle similarity index 100% rename from 24-storm+trident-topology.graffle rename to attic/24-storm+trident-topology.graffle diff --git a/24-storm+trident-topology.png b/attic/24-storm+trident-topology.png similarity index 100% rename from 24-storm+trident-topology.png rename to attic/24-storm+trident-topology.png diff --git a/24a-storm+trident-overview.asciidoc b/attic/24a-storm+trident-overview.asciidoc similarity index 100% rename from 24a-storm+trident-overview.asciidoc rename to attic/24a-storm+trident-overview.asciidoc diff --git a/25-storm+trident-tuning.asciidoc b/attic/25-storm+trident-tuning.asciidoc similarity index 100% rename from 25-storm+trident-tuning.asciidoc rename to attic/25-storm+trident-tuning.asciidoc diff --git a/90-style_guide.asciidoc b/attic/90-style_guide.asciidoc similarity index 100% rename from 90-style_guide.asciidoc rename to attic/90-style_guide.asciidoc diff --git a/99-dumping_ground.asciidoc b/attic/99-dumping_ground.asciidoc similarity index 100% rename from 99-dumping_ground.asciidoc rename to attic/99-dumping_ground.asciidoc diff --git a/cheatsheets/26-cheatsheet-sql_hive_pig.asciidoc b/cheatsheets/26-cheatsheet-sql_hive_pig.asciidoc new file mode 100644 index 0000000..614133c --- /dev/null +++ b/cheatsheets/26-cheatsheet-sql_hive_pig.asciidoc @@ -0,0 +1,17 @@ + + +=== SQL-to-Pig-to-Hive Cheatsheet + +* SELECT..WHERE +* SELECT...LIMit +* GROUP BY...HAVING +* SELECT WHERE... ORDER BY +* SELECT WHERE... SORT BY (just use reducer sort) ~~ (does reducer in Pig guarantee this?) +* SELECT … DISTRIBUTE BY … SORT BY ... +* SELECT ... 
CLUSTER BY (equiv of distribute by X sort by X) +* Indexing tips +* CASE...when...then +* Block Sampling / Input pruning +* SELECT country_name, indicator_name, `2011` AS trade_2011 FROM wdi WHERE (indicator_name = 'Trade (% of GDP)' OR indicator_name = 'Broad money (% of GDP)') AND `2011` IS NOT NULL CLUSTER BY indicator_name; + +SELECT columns or computations FROM table WHERE condition GROUP BY columns HAVING condition ORDER BY column [ASC | DESC] LIMIT offset,count; diff --git a/99-business_applications_of_hadoop.asciidoc b/supplementary/99-business_applications_of_hadoop.asciidoc similarity index 100% rename from 99-business_applications_of_hadoop.asciidoc rename to supplementary/99-business_applications_of_hadoop.asciidoc