

transform ( testData ) # Select example rows to display. dt = DecisionTreeClassifier ( labelCol = "indexedLabel", featuresCol = "indexedFeatures" ) # Chain indexers and tree in a Pipeline pipeline = Pipeline ( stages = ) # Train model. randomSplit (, 1234 ) # Create a DecisionTree model. fit ( data ) # Split the data into training and test sets (40% held out for testing) ( trainingData, testData ) = data. VectorIndexer ( inputCol = "features", outputCol = "indexedFeatures", maxCategories = 4 ). # We specify maxCategories so features with > 4 distinct values are treated as continuous.

fit ( data ) # Automatically identify categorical features, and index them. labelIndexer = StringIndexer ( inputCol = "label", outputCol = "indexedLabel" ). # Fit on whole dataset to include all labels in index. load ( "data/sample_multiclass_classification_data.txt" ) # Index labels, adding metadata to the label column. appName ( "DecisionTreeMulticlassClassificationExample" )\ evaluate ( predictions ) print ( "Test set accuracy = " + str ( accuracy )) treeModel = model. show ( 5 ) # Select (prediction, true label) and compute accuracy evaluator = MulticlassClassificationEvaluator ( labelCol = "indexedLabel", predictionCol = "prediction", metricName = "accuracy" ) accuracy = evaluator. select ( "prediction", "indexedLabel", "features" ). fit ( data ) # Split the data into training and test sets (30% held out for testing) ( trainingData, testData ) = data. load ( "data/sample_libsvm_data.txt" ) # Index labels, adding metadata to the label column. appName ( "DecisionTreeBinaryClassificationExample" )\

from pyspark.ml import Pipeline from pyspark.ml.classification import DecisionTreeClassifier from pyspark.ml.feature import StringIndexer, VectorIndexer from pyspark.ml.evaluation import MulticlassClassificationEvaluator from pyspark.sql import SparkSession spark = SparkSession\
