Каковы различные типы соединений в Spark?
Я просмотрел документацию, и в ней сказано, что поддерживаются следующие типы соединений:
Тип выполняемого соединения. По умолчанию inner. Должен быть одним из: inner, cross, outer, full, full_outer, left, left_outer, right, right_outer, left_semi, left_anti.
Я посмотрел вопрос о SQL-соединениях на qaru.site/info/35551/..., но в паре самых популярных ответов не упоминаются некоторые из перечисленных выше соединений, например left_semi и left_anti. Что они означают в Spark?
Ответы
Ответ 1
Вот простой иллюстративный эксперимент:
import org.apache.spark.sql._
/**
 * Small runnable experiment: joins two tiny DataFrames on the `id` column once per join type
 * and prints every result, so the effect of each join type can be compared side by side.
 *
 * Uses an explicit `main` instead of the `App` trait (which defers initialization of the
 * object body) and always stops the local Spark session when done.
 */
object SparkSandbox {

  /**
   * Join type names passed to `join`, in the order in which their results are printed.
   * NOTE(review): "cross" is not listed — presumably because the `usingColumns` form of `join`
   * does not accept it; confirm before adding it.
   */
  private val JoinTypes: Seq[String] = Seq(
    "inner",
    "outer", "full", "full_outer",
    "left", "left_outer",
    "right", "right_outer",
    "left_semi",
    "left_anti")

  /**
   * Entry point of the experiment.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    try {
      import spark.implicits._
      // Restrict Spark's own logging to errors so the printed tables stay readable.
      spark.sparkContext.setLogLevel("ERROR")

      val left = Seq((1, "A1"), (2, "A2"), (3, "A3"), (4, "A4")).toDF("id", "value")
      val right =
        Seq((3, "A3"), (4, "A4"), (4, "A4_1"), (5, "A5"), (6, "A6")).toDF("id", "value")

      println("LEFT")
      left.orderBy("id").show()
      println("RIGHT")
      right.orderBy("id").show()

      JoinTypes.foreach { joinType =>
        println(s"${joinType.toUpperCase()} JOIN")
        left
          .join(right = right, usingColumns = Seq("id"), joinType = joinType)
          .orderBy("id")
          .show()
      }
    } finally {
      // Release the local Spark session even if one of the joins fails.
      spark.stop()
    }
  }
}
Вывод
LEFT
+---+-----+
| id|value|
+---+-----+
| 1| A1|
| 2| A2|
| 3| A3|
| 4| A4|
+---+-----+
RIGHT
+---+-----+
| id|value|
+---+-----+
| 3| A3|
| 4| A4|
| 4| A4_1|
| 5| A5|
| 6| A6|
+---+-----+
INNER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
+---+-----+-----+
OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
FULL JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
FULL_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
LEFT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
+---+-----+-----+
LEFT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 1| A1| null|
| 2| A2| null|
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
+---+-----+-----+
RIGHT JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 3| A3| A3|
| 4| A4| A4_1|
| 4| A4| A4|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
RIGHT_OUTER JOIN
+---+-----+-----+
| id|value|value|
+---+-----+-----+
| 3| A3| A3|
| 4| A4| A4|
| 4| A4| A4_1|
| 5| null| A5|
| 6| null| A6|
+---+-----+-----+
LEFT_SEMI JOIN
+---+-----+
| id|value|
+---+-----+
| 3| A3|
| 4| A4|
+---+-----+
LEFT_ANTI JOIN
+---+-----+
| id|value|
+---+-----+
| 1| A1|
| 2| A2|
+---+-----+
Ответ 2
Вот наглядное объяснение различных типов соединений, доступных в Spark (иллюстрация из исходного ответа здесь не сохранилась).
Посмотрите эту замечательную запись в блоге о соединениях в Spark: http://kirillpavlov.com/blog/2016/04/23/beyond-traditional-join-with-apache-spark/
Ответ 3
Мне очень понравился пример Патикрита. Вот возможный перевод на Java с использованием Spark v2 и датафреймов (DataFrame), включая перекрёстное соединение (cross join).
package net.jgp.books.sparkInAction.ch12.lab940AllJoins;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
/**
 * All joins in a single app, inspired by
 * https://stackoverflow.com/questions/45990633/what-are-the-various-join-types-in-spark.
 *
 * Builds two small dataframes sharing an (id, value) schema, joins them on {@code id} once per
 * join type and prints every result.
 *
 * Used in Spark in Action 2e, http://jgp.net/sia
 *
 * @author jgp
 */
public class AllJoinsApp {

  /** Join type names tried by the app, with the Spark version noted by the original author. */
  private static final String[] JOIN_TYPES = new String[] {
      "inner", // v2.0.0. default
      "cross", // v2.2.0
      "outer", // v2.0.0
      "full", // v2.1.1
      "full_outer", // v2.1.1
      "left", // v2.1.1
      "left_outer", // v2.0.0
      "right", // v2.1.1
      "right_outer", // v2.0.0
      "left_semi", // v2.0.0, was leftsemi before v2.1.1
      "left_anti" // v2.1.1
  };

  /**
   * main() is your entry point to the application.
   *
   * @param args command-line arguments (not used)
   */
  public static void main(String[] args) {
    AllJoinsApp app = new AllJoinsApp();
    app.start();
  }

  /**
   * The processing code: creates the session, builds both dataframes, runs and prints every
   * join, and finally stops the session.
   */
  private void start() {
    // Creates a session on a local master
    SparkSession spark = SparkSession.builder()
        .appName("Processing of invoices")
        .master("local")
        .getOrCreate();

    try {
      // Both dataframes share the same two-column, non-nullable schema.
      StructType schema = DataTypes.createStructType(new StructField[] {
          DataTypes.createStructField(
              "id",
              DataTypes.IntegerType,
              false),
          DataTypes.createStructField(
              "value",
              DataTypes.StringType,
              false) });

      List<Row> rows = new ArrayList<>();
      rows.add(RowFactory.create(1, "A1"));
      rows.add(RowFactory.create(2, "A2"));
      rows.add(RowFactory.create(3, "A3"));
      rows.add(RowFactory.create(4, "A4"));
      Dataset<Row> dfLeft = spark.createDataFrame(rows, schema);
      dfLeft.show();

      // The right side repeats id 4 and adds ids 5 and 6, which are absent on the left.
      rows = new ArrayList<>();
      rows.add(RowFactory.create(3, "A3"));
      rows.add(RowFactory.create(4, "A4"));
      rows.add(RowFactory.create(4, "A4_1"));
      rows.add(RowFactory.create(5, "A5"));
      rows.add(RowFactory.create(6, "A6"));
      Dataset<Row> dfRight = spark.createDataFrame(rows, schema);
      dfRight.show();

      for (String joinType : JOIN_TYPES) {
        System.out.println(joinType.toUpperCase() + " JOIN");
        Dataset<Row> df = dfLeft.join(
            dfRight,
            dfLeft.col("id").equalTo(dfRight.col("id")),
            joinType);
        df.orderBy(dfLeft.col("id")).show();
      }
    } finally {
      // Release the local Spark session even when a join fails.
      spark.stop();
    }
  }
}
Я помещу этот пример в репозиторий Spark in Action, 2e, глава 12.
Ответ 4
Spark DataFrames support the following types of joins between two DataFrames.
Below is the list of joins and the join-type string for each one, along with the Scala syntax.
The following strings can be used to specify the join type in Scala Spark code.
***Method:*** Leftdataframe.join(Rightdataframe, join_conditions, joinStringName)
Join name : join-type string in Scala Spark code
1. inner : 'inner'
2. cross: 'cross'
3. outer: 'outer'
4. full: 'full'
5. full outer: 'fullouter'
6. left : 'left'
7. left outer : 'leftouter'
8. right : 'right'
9. right outer : 'rightouter'
10. left semi: 'leftsemi'
11. left anti: 'leftanti'
example: 1. Left Semi join:
Leftdataframe.join(Rightdataframe, join_conditions, "leftsemi");
2. inner Join Example:
Leftdataframe.join(Rightdataframe, join_conditions, "inner");
It's tested and works well.
Ответ 5
В Spark SQL доступны различные типы соединений; они перечислены ниже. Подробности — по ссылке, пример кода — на GitHub (ссылка).
Соединения (joins)
1) JOIN
2) {LEFT|RIGHT|FULL} OUTER JOIN
3) LEFT SEMI JOIN
4) CROSS JOIN
Пример:
package org.apache.spark.sql.catalyst.plans
import java.util.Locale
import org.apache.spark.sql.catalyst.expressions.Attribute
/** Factory resolving a join type name to the corresponding [[JoinType]]. */
object JoinType {

  /**
   * Looks up the join type named by `typ`.
   *
   * Before matching, the name is lower-cased using `Locale.ROOT` and all underscores are
   * removed, so spellings such as "left_outer" and "LeftOuter" resolve to the same join type.
   *
   * @param typ the join type name
   * @return the join type matching the normalized name
   * @throws IllegalArgumentException if the normalized name matches no known join type; the
   *                                  message lists the supported spellings
   */
  def apply(typ: String): JoinType = {
    val normalized = typ.toLowerCase(Locale.ROOT).replace("_", "")
    normalized match {
      case "inner" => Inner
      case "cross" => Cross
      case "leftsemi" => LeftSemi
      case "leftanti" => LeftAnti
      case "left" | "leftouter" => LeftOuter
      case "right" | "rightouter" => RightOuter
      case "full" | "outer" | "fullouter" => FullOuter
      case _ =>
        // Spellings advertised to the caller in the error message.
        val supported = Seq(
          "inner",
          "outer", "full", "fullouter", "full_outer",
          "leftouter", "left", "left_outer",
          "rightouter", "right", "right_outer",
          "leftsemi", "left_semi",
          "leftanti", "left_anti",
          "cross")
        val listing = supported.mkString("'", "', '", "'")
        throw new IllegalArgumentException(
          s"Unsupported join type '$typ'. " + "Supported join types include: " + listing + ".")
    }
  }
}
/**
 * Base of the closed set of join types; being sealed, all direct subtypes live in this file.
 */
sealed abstract class JoinType {

  /** The textual form of this join type used when rendering SQL. */
  def sql: String
}
/**
 * Common parent of the inner-style join types.
 *
 * `explicitCartesian` tells whether the join was written as a CROSS join, i.e. whether a
 * cartesian product was requested explicitly.
 */
sealed abstract class InnerLike extends JoinType {

  /** True when a cartesian product was explicitly requested via a CROSS join. */
  def explicitCartesian: Boolean
}
/** Inner join; not an explicitly requested cartesian product. */
case object Inner extends InnerLike {

  override def sql: String = "INNER"

  override def explicitCartesian: Boolean = false
}
/** Cross join; marks the cartesian product as explicitly requested. */
case object Cross extends InnerLike {

  override def sql: String = "CROSS"

  override def explicitCartesian: Boolean = true
}
/** Left outer join, rendered in SQL as "LEFT OUTER". */
case object LeftOuter extends JoinType {

  override def sql: String = "LEFT OUTER"
}
/** Right outer join, rendered in SQL as "RIGHT OUTER". */
case object RightOuter extends JoinType {

  override def sql: String = "RIGHT OUTER"
}
/** Full outer join, rendered in SQL as "FULL OUTER". */
case object FullOuter extends JoinType {

  override def sql: String = "FULL OUTER"
}
/** Left semi join, rendered in SQL as "LEFT SEMI". */
case object LeftSemi extends JoinType {

  override def sql: String = "LEFT SEMI"
}
/** Left anti join, rendered in SQL as "LEFT ANTI". */
case object LeftAnti extends JoinType {

  override def sql: String = "LEFT ANTI"
}
/**
 * Join type that carries an `exists` attribute.
 *
 * It is only used at the end of the optimizer and in physical plans, so SQL is never generated
 * for it: calling [[sql]] always throws `UnsupportedOperationException`.
 *
 * @param exists attribute associated with this join
 *               (NOTE(review): presumably the flag column recording whether a match exists —
 *               confirm against the optimizer rules that create it)
 */
case class ExistenceJoin(exists: Attribute) extends JoinType {

  // Deliberately unsupported: this join type is never rendered as SQL.
  override def sql: String = throw new UnsupportedOperationException
}
/**
 * A NATURAL join wrapping an underlying join type.
 *
 * Construction fails with `IllegalArgumentException` unless `tpe` is one of [[Inner]],
 * [[LeftOuter]], [[RightOuter]] or [[FullOuter]].
 *
 * @param tpe the underlying join type
 */
case class NaturalJoin(tpe: JoinType) extends JoinType {
  require(
    tpe match {
      case Inner | LeftOuter | RightOuter | FullOuter => true
      case _ => false
    },
    "Unsupported natural join type " + tpe)

  /** "NATURAL " followed by the SQL text of the underlying join type. */
  override def sql: String = "NATURAL " + tpe.sql
}
/**
 * A USING join: an underlying join type together with the names of the columns joined on.
 *
 * Construction fails with `IllegalArgumentException` unless `tpe` is one of [[Inner]],
 * [[LeftOuter]], [[LeftSemi]], [[RightOuter]], [[FullOuter]] or [[LeftAnti]].
 *
 * @param tpe          the underlying join type
 * @param usingColumns names of the columns used for the join
 */
case class UsingJoin(tpe: JoinType, usingColumns: Seq[String]) extends JoinType {
  require(
    tpe match {
      case Inner | LeftOuter | LeftSemi | RightOuter | FullOuter | LeftAnti => true
      case _ => false
    },
    "Unsupported using join type " + tpe)

  /** "USING " followed by the SQL text of the underlying join type. */
  override def sql: String = "USING " + tpe.sql
}
/**
 * Extractor that matches [[LeftSemi]], [[LeftAnti]] and any [[ExistenceJoin]].
 */
object LeftExistence {

  /**
   * @param joinType the join type to test
   * @return `Some(joinType)` for a left semi, left anti or existence join, otherwise `None`
   */
  def unapply(joinType: JoinType): Option[JoinType] = joinType match {
    case LeftSemi | LeftAnti | _: ExistenceJoin => Some(joinType)
    case _ => None
  }
}
См. примеры на Stack Overflow по ссылке.
Ответ 6
Пожалуйста, обратитесь к моей записи в блоге:
https://medium.com/@vishwajeet.pol/joins-in-apache-spark-may-that-be-warehouse-or-bigdata-joins-will-always-there-2d9d16b884b3
Ответ 7
Left Semi возвращает строки левой таблицы, для которых ключ соединения есть и в правой таблице; в результат входят только столбцы левой таблицы, и каждая строка левой таблицы появляется не более одного раза (в примере выше id = 4 выводится один раз, хотя справа у него два совпадения).
Left Anti возвращает строки левой таблицы, ключ соединения которых отсутствует в правой таблице.
Хорошие описания различных типов соединений: https://www.cloudera.com/documentation/enterprise/latest/topics/impala_joins.html